summaryrefslogtreecommitdiffstats
path: root/xlators/cluster
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster')
-rw-r--r--xlators/cluster/Makefile.am3
-rw-r--r--xlators/cluster/afr/Makefile.am3
-rw-r--r--xlators/cluster/afr/src/Makefile.am20
-rw-r--r--xlators/cluster/afr/src/afr-dir-read.c345
-rw-r--r--xlators/cluster/afr/src/afr-dir-read.h47
-rw-r--r--xlators/cluster/afr/src/afr-dir-write.c1786
-rw-r--r--xlators/cluster/afr/src/afr-dir-write.h59
-rw-r--r--xlators/cluster/afr/src/afr-inode-read.c721
-rw-r--r--xlators/cluster/afr/src/afr-inode-read.h47
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c2024
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.h63
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c1073
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.h66
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c1030
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c2038
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c791
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h52
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c957
-rw-r--r--xlators/cluster/afr/src/afr-transaction.h36
-rw-r--r--xlators/cluster/afr/src/afr.c2338
-rw-r--r--xlators/cluster/afr/src/afr.h523
-rw-r--r--xlators/cluster/dht/Makefile.am1
-rw-r--r--xlators/cluster/dht/src/Makefile.am30
-rw-r--r--xlators/cluster/dht/src/dht-common.c3470
-rw-r--r--xlators/cluster/dht/src/dht-common.h212
-rw-r--r--xlators/cluster/dht/src/dht-hashfn-tea.c146
-rw-r--r--xlators/cluster/dht/src/dht-hashfn.c88
-rw-r--r--xlators/cluster/dht/src/dht-helper.c326
-rw-r--r--xlators/cluster/dht/src/dht-layout.c543
-rw-r--r--xlators/cluster/dht/src/dht-linkfile.c224
-rw-r--r--xlators/cluster/dht/src/dht-rename.c562
-rw-r--r--xlators/cluster/dht/src/dht-selfheal.c460
-rw-r--r--xlators/cluster/dht/src/dht.c222
-rw-r--r--xlators/cluster/dht/src/nufa.c684
-rw-r--r--xlators/cluster/ha/Makefile.am3
-rw-r--r--xlators/cluster/ha/src/Makefile.am15
-rw-r--r--xlators/cluster/ha/src/ha-helpers.c191
-rw-r--r--xlators/cluster/ha/src/ha.c3479
-rw-r--r--xlators/cluster/ha/src/ha.h59
-rw-r--r--xlators/cluster/map/Makefile.am3
-rw-r--r--xlators/cluster/map/src/Makefile.am15
-rw-r--r--xlators/cluster/map/src/map-helper.c357
-rw-r--r--xlators/cluster/map/src/map.c2193
-rw-r--r--xlators/cluster/map/src/map.h76
-rw-r--r--xlators/cluster/stripe/Makefile.am3
-rw-r--r--xlators/cluster/stripe/src/Makefile.am14
-rw-r--r--xlators/cluster/stripe/src/stripe.c3286
-rw-r--r--xlators/cluster/unify/Makefile.am3
-rw-r--r--xlators/cluster/unify/src/Makefile.am16
-rw-r--r--xlators/cluster/unify/src/unify-self-heal.c1225
-rw-r--r--xlators/cluster/unify/src/unify.c4451
-rw-r--r--xlators/cluster/unify/src/unify.h132
52 files changed, 36511 insertions, 0 deletions
diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am
new file mode 100644
index 000000000..a6ddb3564
--- /dev/null
+++ b/xlators/cluster/Makefile.am
@@ -0,0 +1,3 @@
+# Sub-directories automake descends into, one per cluster translator.
+SUBDIRS = unify stripe afr dht ha map
+
+CLEANFILES =
diff --git a/xlators/cluster/afr/Makefile.am b/xlators/cluster/afr/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/cluster/afr/Makefile.am
@@ -0,0 +1,3 @@
+# The afr translator sources live in src/.
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am
new file mode 100644
index 000000000..1bde9e5ba
--- /dev/null
+++ b/xlators/cluster/afr/src/Makefile.am
@@ -0,0 +1,20 @@
+# Builds afr.la as a libtool module installed into the cluster xlator
+# directory; replicate.so is installed as a symlink to afr.so.
+xlator_LTLIBRARIES = afr.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+# libtool's documented spelling of the flag is -avoid-version.
+afr_la_LDFLAGS = -module -avoid-version
+
+afr_la_SOURCES = afr.c afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c
+afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
+	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES =
+
+# Remove the replicate.so alias created by install-data-hook.
+uninstall-local:
+	rm -f $(DESTDIR)$(xlatordir)/replicate.so
+
+# Make the translator loadable under the name "replicate" as well.
+install-data-hook:
+	ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so
\ No newline at end of file
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
new file mode 100644
index 000000000..0c65ca852
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -0,0 +1,345 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+
+#include "afr.h"
+
+
+/**
+ * afr_opendir_cbk - collect one child's reply to opendir.
+ *
+ * A reply with op_ret == 0 marks the whole call successful; op_errno of
+ * every reply overwrites the stored one. When afr_frame_return () reports
+ * that no replies are outstanding, the stored result and fd are unwound.
+ */
+int32_t
+afr_opendir_cbk (call_frame_t *frame, void *cookie,
+                 xlator_t *this, int32_t op_ret, int32_t op_errno,
+                 fd_t *fd)
+{
+        afr_local_t *local   = frame->local;
+        int          pending = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == 0)
+                        local->op_ret = 0;
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        /* Last reply: hand the aggregated result back to the caller. */
+        AFR_STACK_UNWIND (frame, local->op_ret,
+                          local->op_errno, local->fd);
+
+        return 0;
+}
+
+
+/**
+ * afr_opendir - wind opendir to every child marked up in local->child_up.
+ *
+ * @frame: call frame of the request
+ * @this:  this translator
+ * @loc:   location of the directory
+ * @fd:    fd to open; a reference is kept in local->fd
+ *
+ * Replies are aggregated by afr_opendir_cbk (). If setup fails, the call
+ * is unwound here with op_ret -1.
+ *
+ * NOTE(review): when AFR_LOCAL_INIT fails, local has not been attached to
+ * the frame yet - confirm who releases it.
+ */
+int32_t
+afr_opendir (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, fd_t *fd)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        int ret       = -1;
+        int nchildren = 0;
+        int remaining = -1;
+        int idx       = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv      = this->private;
+        nchildren = priv->child_count;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        local->fd    = fd_ref (fd);
+        frame->local = local;
+
+        /* Snapshot the count: local may be touched by callbacks once
+           the first wind has been issued. */
+        remaining = local->call_count;
+
+        for (idx = 0; idx < nchildren; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND (frame, afr_opendir_cbk,
+                            priv->children[idx],
+                            priv->children[idx]->fops->opendir,
+                            loc, fd);
+
+                if (--remaining == 0)
+                        break;
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, fd);
+        }
+
+        return 0;
+}
+
+
+/**
+ * Common algorithm for directory read calls:
+ *
+ * - Try the fop on the first child that is up
+ * - if the fop fails (op_ret == -1, whatever the errno):
+ * try the next child index, stopping once all_tried () is true
+ *
+ * Applicable to: readdir, getdents
+ */
+
+/**
+ * afr_readdir_cbk - handle a child's reply to readdir.
+ *
+ * A successful reply is unwound as-is. On failure the next child index is
+ * tried with the saved fd, size and offset, unless all_tried () says there
+ * is nothing left, in which case the failure is unwound.
+ */
+int32_t
+afr_readdir_cbk (call_frame_t *frame, void *cookie,
+                 xlator_t *this, int32_t op_ret, int32_t op_errno,
+                 gf_dirent_t *buf)
+{
+        afr_private_t *priv  = this->private;
+        afr_local_t   *local = frame->local;
+        int            next  = -1;
+
+        /* Success, or no fallback left: report what we got. */
+        if ((op_ret != -1)
+            || all_tried (local->cont.readdir.last_tried,
+                          priv->child_count)) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
+                return 0;
+        }
+
+        next = ++local->cont.readdir.last_tried;
+
+        STACK_WIND (frame, afr_readdir_cbk,
+                    priv->children[next],
+                    priv->children[next]->fops->readdir,
+                    local->fd, local->cont.readdir.size,
+                    local->cont.readdir.offset);
+
+        return 0;
+}
+
+
+/**
+ * afr_readdir - read directory entries from the first child that is up.
+ *
+ * @frame:  call frame of the request
+ * @this:   this translator
+ * @fd:     open directory fd; a reference is kept in local->fd
+ * @size:   requested size, saved for retries
+ * @offset: directory offset, saved for retries
+ *
+ * The request arguments are saved in local->cont.readdir so that
+ * afr_readdir_cbk () can re-issue the call on another child. Unwinds
+ * with ENOTCONN when afr_first_up_child () returns -1.
+ *
+ * NOTE(review): when AFR_LOCAL_INIT fails, local has not been attached to
+ * the frame yet - confirm who releases it.
+ */
+int32_t
+afr_readdir (call_frame_t *frame, xlator_t *this,
+             fd_t *fd, size_t size, off_t offset)
+{
+        afr_private_t *priv       = NULL;
+        afr_local_t   *local      = NULL;
+        xlator_t      *target     = NULL;
+        int            call_child = 0;
+        int            ret        = -1;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        frame->local = local;
+
+        call_child = afr_first_up_child (priv);
+        if (call_child == -1) {
+                op_errno = ENOTCONN;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no child is up :(");
+                goto out;
+        }
+
+        /* Remember the request so the callback can retry elsewhere. */
+        local->fd                      = fd_ref (fd);
+        local->cont.readdir.last_tried = call_child;
+        local->cont.readdir.size       = size;
+        local->cont.readdir.offset     = offset;
+
+        target = priv->children[call_child];
+
+        STACK_WIND (frame, afr_readdir_cbk,
+                    target, target->fops->readdir,
+                    fd, size, offset);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+        return 0;
+}
+
+
+/**
+ * afr_getdents_cbk - handle a child's reply to getdents.
+ *
+ * A successful reply is unwound as-is. On failure the next child index is
+ * tried with the saved fd, size, offset and flag, unless all_tried () says
+ * there is nothing left, in which case the failure is unwound.
+ */
+int32_t
+afr_getdents_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno,
+                  dir_entry_t *entry, int32_t count)
+{
+        afr_private_t *priv  = this->private;
+        afr_local_t   *local = frame->local;
+        int            next  = -1;
+
+        /* Success, or no fallback left: report what we got. */
+        if ((op_ret != -1)
+            || all_tried (local->cont.getdents.last_tried,
+                          priv->child_count)) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, entry, count);
+                return 0;
+        }
+
+        next = ++local->cont.getdents.last_tried;
+
+        STACK_WIND (frame, afr_getdents_cbk,
+                    priv->children[next],
+                    priv->children[next]->fops->getdents,
+                    local->fd, local->cont.getdents.size,
+                    local->cont.getdents.offset, local->cont.getdents.flag);
+
+        return 0;
+}
+
+
+/**
+ * afr_getdents - fetch directory entries from the first child that is up.
+ *
+ * @frame:  call frame of the request
+ * @this:   this translator
+ * @fd:     open directory fd; a reference is kept in local->fd
+ * @size:   requested size, saved for retries
+ * @offset: directory offset, saved for retries
+ * @flag:   getdents flag, passed through and saved for retries
+ *
+ * The request arguments are saved in local->cont.getdents so that
+ * afr_getdents_cbk () can re-issue the call on another child. On failure
+ * the call is unwound with op_ret -1, a NULL entry list and a count of 0.
+ *
+ * NOTE(review): unlike afr_readdir (), this path never runs
+ * AFR_LOCAL_INIT on local - confirm that is intentional.
+ */
+int32_t
+afr_getdents (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, size_t size, off_t offset, int32_t flag)
+{
+        afr_private_t *priv       = NULL;
+        xlator_t     **children   = NULL;
+        int            call_child = 0;
+        afr_local_t   *local      = NULL;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv     = this->private;
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        call_child = afr_first_up_child (priv);
+        if (call_child == -1) {
+                op_errno = ENOTCONN;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no child is up :(");
+                /* local is not attached to the frame yet, so nothing
+                   downstream can release it: drop it here. */
+                FREE (local);
+                goto out;
+        }
+
+        local->cont.getdents.last_tried = call_child;
+
+        local->fd = fd_ref (fd);
+
+        local->cont.getdents.size   = size;
+        local->cont.getdents.offset = offset;
+        local->cont.getdents.flag   = flag;
+
+        frame->local = local;
+
+        STACK_WIND (frame, afr_getdents_cbk,
+                    children[call_child], children[call_child]->fops->getdents,
+                    fd, size, offset, flag);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                /* The getdents callback takes (entry, count); supply
+                   both so the caller never reads a missing argument. */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, 0);
+        }
+
+        return 0;
+}
+
+
diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h
new file mode 100644
index 000000000..172ec3c90
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-dir-read.h
@@ -0,0 +1,47 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __DIR_READ_H__
+#define __DIR_READ_H__
+
+/* Entry points of the afr translator for directory read operations. */
+
+/* Open directory `loc` using `fd`. */
+int32_t
+afr_opendir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, fd_t *fd);
+
+/* NOTE(review): presumably the closedir counterpart of afr_opendir -
+   confirm that a definition exists. */
+int32_t
+afr_closedir (call_frame_t *frame, xlator_t *this,
+ fd_t *fd);
+
+/* Read up to `size` worth of directory entries from `fd` at `offset`. */
+int32_t
+afr_readdir (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset);
+
+
+/* Like afr_readdir, with an additional `flag` passed through to the child. */
+int32_t
+afr_getdents (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset, int32_t flag);
+
+
+/* NOTE(review): presumably a checksum fop on `loc` - confirm that a
+   definition exists. */
+int32_t
+afr_checksum (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags);
+
+
+#endif /* __DIR_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
new file mode 100644
index 000000000..87a6e09b5
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -0,0 +1,1786 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <errno.h>
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+
+#include "afr.h"
+#include "afr-transaction.h"
+
+
+/**
+ * afr_build_parent_loc - fill @parent with the location of @child's parent.
+ *
+ * @parent: loc to fill in
+ * @child:  loc whose parent directory is wanted
+ *
+ * If @child has no parent inode, @parent becomes a copy of @child.
+ * Otherwise the path is the dirname of child->path, name points just past
+ * the last '/' of that path, and inode/parent/ino are derived from
+ * child->parent.
+ *
+ * If a path copy cannot be allocated, parent->path and parent->name are
+ * left untouched instead of dereferencing a NULL pointer; the inode fields
+ * are still filled in.
+ */
+void
+afr_build_parent_loc (loc_t *parent, loc_t *child)
+{
+        char *tmp = NULL;
+
+        if (!child->parent) {
+                loc_copy (parent, child);
+                return;
+        }
+
+        /* dirname () may modify its argument, so work on a copy. */
+        tmp = strdup (child->path);
+        if (tmp) {
+                parent->path = strdup (dirname (tmp));
+                FREE (tmp);
+        }
+
+        if (parent->path) {
+                parent->name = strrchr (parent->path, '/');
+                if (parent->name)
+                        parent->name++;
+        }
+
+        parent->inode  = inode_ref (child->parent);
+        parent->parent = inode_parent (parent->inode, 0, NULL);
+        parent->ino    = parent->inode->ino;
+}
+
+
+/* {{{ create */
+
+/**
+ * afr_create_unwind - reply to the caller of create, at most once.
+ *
+ * The caller's frame is taken out of local->transaction.main_frame under
+ * the frame lock, so a second invocation finds NULL and does nothing.
+ */
+int
+afr_create_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local      = frame->local;
+        call_frame_t *main_frame = NULL;
+
+        LOCK (&frame->lock);
+        {
+                main_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!main_frame)
+                return 0;
+
+        AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+                          local->cont.create.fd,
+                          local->cont.create.inode,
+                          &local->cont.create.buf);
+        return 0;
+}
+
+
+/**
+ * afr_create_wind_cbk - collect one child's reply to create.
+ *
+ * cookie carries the index of the replying child (set by
+ * afr_create_wind). Results are merged into frame->local under the frame
+ * lock; once afr_frame_return () reports zero outstanding replies the
+ * transaction's unwind and then resume hooks are invoked.
+ */
+int
+afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ fd_t *fd, inode_t *inode, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ local->op_ret = op_ret;
+
+ /* keep the stat of the first success, or of the read child */
+ if ((local->success_count == 0)
+ || (child_index == priv->read_child)) {
+ local->cont.create.buf = *buf;
+ local->cont.create.buf.st_ino =
+ afr_itransform (buf->st_ino,
+ priv->child_count,
+ child_index);
+ }
+ local->cont.create.inode = inode;
+
+ local->success_count++;
+ }
+
+ /* overwritten by every reply, successful or not */
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+/**
+ * afr_create_wind - issue create on every child marked up.
+ *
+ * With no child up the transaction is resumed immediately. Otherwise
+ * local->call_count is set to the number of winds and each wind carries
+ * the child index as its cookie.
+ */
+int
+afr_create_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = -1;
+        int            idx       = 0;
+
+        remaining = afr_up_children_count (priv->child_count,
+                                           local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_create_wind_cbk,
+                                   (void *) (long) idx,
+                                   priv->children[idx],
+                                   priv->children[idx]->fops->create,
+                                   &local->loc,
+                                   local->cont.create.flags,
+                                   local->cont.create.mode,
+                                   local->cont.create.fd);
+
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_create_done - final step of the create transaction: invoke the
+ * unwind hook, then destroy the transaction frame.
+ */
+int
+afr_create_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/**
+ * afr_create - create a file as an entry transaction.
+ *
+ * @frame: call frame of the request
+ * @this:  this translator
+ * @loc:   location of the file to create
+ * @flags: open flags passed to the children
+ * @mode:  file mode passed to the children
+ * @fd:    fd to associate with the file; a reference is kept
+ *
+ * The work runs on a copy of @frame; @frame itself is kept in
+ * local->transaction.main_frame and answered by afr_create_unwind ().
+ * On setup failure the copy is destroyed and @frame is unwound with
+ * op_ret -1 and NULL fd, inode and stat.
+ *
+ * NOTE(review): when ALLOC_OR_GOTO succeeds but AFR_LOCAL_INIT fails,
+ * local is not attached to any frame - confirm who releases it.
+ */
+int
+afr_create (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, int32_t flags, mode_t mode, fd_t *fd)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* do not report failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        loc_copy (&local->loc, loc);
+
+        local->cont.create.flags = flags;
+        local->cont.create.mode  = mode;
+        local->cont.create.fd    = fd_ref (fd);
+
+        local->transaction.fop    = afr_create_wind;
+        local->transaction.done   = afr_create_done;
+        local->transaction.unwind = afr_create_unwind;
+
+        afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.basename   = AFR_BASENAME (loc->path);
+        local->transaction.pending    = AFR_ENTRY_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                /* same argument list as afr_create_unwind: fd, inode, buf */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ mknod */
+
+/**
+ * afr_mknod_unwind - reply to the caller of mknod, at most once.
+ *
+ * The caller's frame is taken out of local->transaction.main_frame under
+ * the frame lock, so a second invocation finds NULL and does nothing.
+ */
+int
+afr_mknod_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local      = frame->local;
+        call_frame_t *main_frame = NULL;
+
+        LOCK (&frame->lock);
+        {
+                main_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!main_frame)
+                return 0;
+
+        AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+                          local->cont.mknod.inode,
+                          &local->cont.mknod.buf);
+        return 0;
+}
+
+
+/**
+ * afr_mknod_wind_cbk - collect one child's reply to mknod.
+ *
+ * cookie carries the index of the replying child (set by
+ * afr_mknod_wind). Results are merged into frame->local under the frame
+ * lock; once afr_frame_return () reports zero outstanding replies the
+ * transaction's unwind and then resume hooks are invoked.
+ */
+int
+afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ local->op_ret = op_ret;
+
+ /* keep the stat of the first success, or of the read child */
+ if ((local->success_count == 0)
+ || (child_index == priv->read_child)) {
+ local->cont.mknod.buf = *buf;
+ local->cont.mknod.buf.st_ino =
+ afr_itransform (buf->st_ino,
+ priv->child_count,
+ child_index);
+ }
+ local->cont.mknod.inode = inode;
+
+ local->success_count++;
+ }
+
+ /* overwritten by every reply, successful or not */
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+/**
+ * afr_mknod_wind - issue mknod on every child marked up.
+ *
+ * With no child up the transaction is resumed immediately. Otherwise
+ * local->call_count is set to the number of winds and each wind carries
+ * the child index as its cookie.
+ */
+int32_t
+afr_mknod_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = -1;
+        int            idx       = 0;
+
+        remaining = afr_up_children_count (priv->child_count,
+                                           local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk,
+                                   (void *) (long) idx,
+                                   priv->children[idx],
+                                   priv->children[idx]->fops->mknod,
+                                   &local->loc, local->cont.mknod.mode,
+                                   local->cont.mknod.dev);
+
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_mknod_done - final step of the mknod transaction: invoke the
+ * unwind hook, then destroy the transaction frame.
+ */
+int
+afr_mknod_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/**
+ * afr_mknod - create a node as an entry transaction.
+ *
+ * @frame: call frame of the request
+ * @this:  this translator
+ * @loc:   location of the node to create
+ * @mode:  mode passed to the children
+ * @dev:   device number passed to the children
+ *
+ * The work runs on a copy of @frame; @frame itself is kept in
+ * local->transaction.main_frame and answered by afr_mknod_unwind ().
+ * On setup failure the copy is destroyed and @frame is unwound with
+ * op_ret -1 and NULL inode and stat.
+ *
+ * NOTE(review): when ALLOC_OR_GOTO succeeds but AFR_LOCAL_INIT fails,
+ * local is not attached to any frame - confirm who releases it.
+ */
+int
+afr_mknod (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, mode_t mode, dev_t dev)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* do not report failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        loc_copy (&local->loc, loc);
+
+        local->cont.mknod.mode = mode;
+        local->cont.mknod.dev  = dev;
+
+        local->transaction.fop    = afr_mknod_wind;
+        local->transaction.done   = afr_mknod_done;
+        local->transaction.unwind = afr_mknod_unwind;
+
+        afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.basename   = AFR_BASENAME (loc->path);
+        local->transaction.pending    = AFR_ENTRY_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                /* same argument list as afr_mknod_unwind: inode, buf */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ mkdir */
+
+
+/**
+ * afr_mkdir_unwind - reply to the caller of mkdir, at most once.
+ *
+ * The caller's frame is taken out of local->transaction.main_frame under
+ * the frame lock, so a second invocation finds NULL and does nothing.
+ */
+int
+afr_mkdir_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local      = frame->local;
+        call_frame_t *main_frame = NULL;
+
+        LOCK (&frame->lock);
+        {
+                main_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!main_frame)
+                return 0;
+
+        AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+                          local->cont.mkdir.inode,
+                          &local->cont.mkdir.buf);
+        return 0;
+}
+
+
+/**
+ * afr_mkdir_wind_cbk - collect one child's reply to mkdir.
+ *
+ * cookie carries the index of the replying child (set by
+ * afr_mkdir_wind). Results are merged into frame->local under the frame
+ * lock; once afr_frame_return () reports zero outstanding replies the
+ * transaction's unwind and then resume hooks are invoked.
+ */
+int
+afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ local->op_ret = op_ret;
+
+ /* keep the stat of the first success, or of the read child */
+ if ((local->success_count == 0)
+ || (child_index == priv->read_child)) {
+ local->cont.mkdir.buf = *buf;
+ local->cont.mkdir.buf.st_ino =
+ afr_itransform (buf->st_ino, priv->child_count,
+ child_index);
+ }
+ local->cont.mkdir.inode = inode;
+
+ local->success_count++;
+ }
+
+ /* overwritten by every reply, successful or not */
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+/**
+ * afr_mkdir_wind - issue mkdir on every child marked up.
+ *
+ * With no child up the transaction is resumed immediately. Otherwise
+ * local->call_count is set to the number of winds and each wind carries
+ * the child index as its cookie.
+ */
+int
+afr_mkdir_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = -1;
+        int            idx       = 0;
+
+        remaining = afr_up_children_count (priv->child_count,
+                                           local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk,
+                                   (void *) (long) idx,
+                                   priv->children[idx],
+                                   priv->children[idx]->fops->mkdir,
+                                   &local->loc, local->cont.mkdir.mode);
+
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_mkdir_done - final step of the mkdir transaction: invoke the
+ * unwind hook, then destroy the transaction frame.
+ */
+int
+afr_mkdir_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/**
+ * afr_mkdir - create a directory as an entry transaction.
+ *
+ * @frame: call frame of the request
+ * @this:  this translator
+ * @loc:   location of the directory to create
+ * @mode:  mode passed to the children
+ *
+ * The work runs on a copy of @frame; @frame itself is kept in
+ * local->transaction.main_frame and answered by afr_mkdir_unwind ().
+ * On setup failure the copy is destroyed and @frame is unwound with
+ * op_ret -1 and NULL inode and stat.
+ *
+ * NOTE(review): when ALLOC_OR_GOTO succeeds but AFR_LOCAL_INIT fails,
+ * local is not attached to any frame - confirm who releases it.
+ */
+int
+afr_mkdir (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, mode_t mode)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* do not report failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        loc_copy (&local->loc, loc);
+
+        local->cont.mkdir.mode = mode;
+
+        local->transaction.fop    = afr_mkdir_wind;
+        local->transaction.done   = afr_mkdir_done;
+        local->transaction.unwind = afr_mkdir_unwind;
+
+        afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.basename   = AFR_BASENAME (loc->path);
+        local->transaction.pending    = AFR_ENTRY_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+
+                /* same argument list as afr_mkdir_unwind: inode, buf */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ link */
+
+
+/**
+ * afr_link_unwind - reply to the caller of link, at most once.
+ *
+ * The caller's frame is taken out of local->transaction.main_frame under
+ * the frame lock, so a second invocation finds NULL and does nothing.
+ * The reported st_ino is replaced by the inode number saved in
+ * local->cont.link.ino before unwinding.
+ */
+int
+afr_link_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local      = frame->local;
+        call_frame_t *main_frame = NULL;
+
+        LOCK (&frame->lock);
+        {
+                main_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!main_frame)
+                return 0;
+
+        local->cont.link.buf.st_ino = local->cont.link.ino;
+
+        AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+                          local->cont.link.inode,
+                          &local->cont.link.buf);
+
+        return 0;
+}
+
+
+/**
+ * afr_link_wind_cbk - collect one child's reply to link.
+ *
+ * cookie carries the index of the replying child (set by
+ * afr_link_wind). Results are merged into frame->local under the frame
+ * lock; once afr_frame_return () reports zero outstanding replies the
+ * transaction's unwind and then resume hooks are invoked.
+ */
+int
+afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ local->op_ret = op_ret;
+
+ /* keep the stat of the first success, or of the read child */
+ if ((local->success_count == 0)
+ || (child_index == priv->read_child)) {
+ local->cont.link.buf = *buf;
+ local->cont.link.buf.st_ino =
+ afr_itransform (buf->st_ino, priv->child_count,
+ child_index);
+ }
+ local->cont.link.inode = inode;
+
+ local->success_count++;
+ }
+
+ /* overwritten by every reply, successful or not */
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+/**
+ * afr_link_wind - issue link on every child marked up.
+ *
+ * With no child up the transaction is resumed immediately. Otherwise
+ * local->call_count is set to the number of winds and each wind carries
+ * the child index as its cookie.
+ */
+int
+afr_link_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = -1;
+        int            idx       = 0;
+
+        remaining = afr_up_children_count (priv->child_count,
+                                           local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_link_wind_cbk,
+                                   (void *) (long) idx,
+                                   priv->children[idx],
+                                   priv->children[idx]->fops->link,
+                                   &local->loc,
+                                   &local->newloc);
+
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_link_done - final step of the link transaction: invoke the
+ * unwind hook, then destroy the transaction frame.
+ */
+int
+afr_link_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = NULL;
+
+        local = frame->local;
+
+        local->transaction.unwind (frame, this);
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/**
+ * afr_link - create a hard link as an entry transaction.
+ *
+ * @frame:  call frame of the request
+ * @this:   this translator
+ * @oldloc: existing location
+ * @newloc: location of the new link
+ *
+ * The work runs on a copy of @frame; @frame itself is kept in
+ * local->transaction.main_frame and answered by afr_link_unwind ().
+ * The inode number of @oldloc is saved so the reply can report it.
+ * On setup failure the copy is destroyed and @frame is unwound with
+ * op_ret -1 and NULL inode and stat.
+ *
+ * NOTE(review): the transaction's parent loc and basename are built from
+ * @oldloc although the new entry is named by @newloc - confirm this is
+ * the directory the transaction is meant to cover.
+ * NOTE(review): when ALLOC_OR_GOTO succeeds but AFR_LOCAL_INIT fails,
+ * local is not attached to any frame - confirm who releases it.
+ */
+int
+afr_link (call_frame_t *frame, xlator_t *this,
+          loc_t *oldloc, loc_t *newloc)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* do not report failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        loc_copy (&local->loc, oldloc);
+        loc_copy (&local->newloc, newloc);
+
+        local->cont.link.ino = oldloc->inode->ino;
+
+        local->transaction.fop    = afr_link_wind;
+        local->transaction.done   = afr_link_done;
+        local->transaction.unwind = afr_link_unwind;
+
+        afr_build_parent_loc (&local->transaction.parent_loc, oldloc);
+
+        local->transaction.main_frame   = frame;
+        local->transaction.basename     = AFR_BASENAME (oldloc->path);
+        local->transaction.new_basename = AFR_BASENAME (newloc->path);
+        local->transaction.pending      = AFR_ENTRY_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                /* same argument list as afr_link_unwind: inode, buf */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ symlink */
+
+
+/**
+ * afr_symlink_unwind - reply to the caller of symlink, at most once.
+ *
+ * The caller's frame is taken out of local->transaction.main_frame under
+ * the frame lock, so a second invocation finds NULL and does nothing.
+ */
+int
+afr_symlink_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local      = frame->local;
+        call_frame_t *main_frame = NULL;
+
+        LOCK (&frame->lock);
+        {
+                main_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!main_frame)
+                return 0;
+
+        AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+                          local->cont.symlink.inode,
+                          &local->cont.symlink.buf);
+        return 0;
+}
+
+
+/**
+ * afr_symlink_wind_cbk - collect one child's reply to symlink.
+ *
+ * cookie carries the index of the replying child (set by
+ * afr_symlink_wind). Results are merged into frame->local under the frame
+ * lock; once afr_frame_return () reports zero outstanding replies the
+ * transaction's unwind and then resume hooks are invoked.
+ */
+int
+afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ local->op_ret = op_ret;
+
+ /* keep the stat of the first success, or of the read child */
+ if ((local->success_count == 0)
+ || (child_index == priv->read_child)) {
+ local->cont.symlink.buf = *buf;
+ local->cont.symlink.buf.st_ino =
+ afr_itransform (buf->st_ino, priv->child_count,
+ child_index);
+ }
+ local->cont.symlink.inode = inode;
+
+ local->success_count++;
+ }
+
+ /* overwritten by every reply, successful or not */
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+/**
+ * Wind the symlink fop to every child marked up in local->child_up.
+ *
+ * If no child is up, the transaction is resumed immediately.
+ */
+int
+afr_symlink_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = 0;
+        int            idx       = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk,
+                                   (void *) (long) idx,
+                                   priv->children[idx],
+                                   priv->children[idx]->fops->symlink,
+                                   local->cont.symlink.linkpath,
+                                   &local->loc);
+
+                /* leave the loop once every counted child has been wound */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * Completion hook of the symlink transaction: run the unwind hook, then
+ * destroy the transaction frame.
+ */
+int
+afr_symlink_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = NULL;
+
+        local = frame->local;
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+        return 0;
+}
+
+
+/**
+ * Entry point of the symlink fop.
+ *
+ * Copies the caller's frame into a transaction frame, attaches a freshly
+ * initialised afr_local_t to it, stores the target location and link path,
+ * and starts an AFR_ENTRY_TRANSACTION. On any setup failure the
+ * transaction frame (if created) is destroyed and the caller's frame is
+ * unwound with op_ret == -1.
+ *
+ * @param frame     caller's frame; unwound later through
+ *                  local->transaction.main_frame
+ * @param this      the AFR translator instance
+ * @param linkpath  link contents; duplicated with strdup()
+ * @param loc       location of the symlink to create
+ * @return always 0; the result is delivered through the unwind
+ */
+int
+afr_symlink (call_frame_t *frame, xlator_t *this,
+             const char *linkpath, loc_t *loc)
+{
+        afr_private_t * priv = NULL;
+        afr_local_t * local = NULL;
+        call_frame_t * transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* report the failure instead of unwinding with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        loc_copy (&local->loc, loc);
+
+        local->cont.symlink.ino = loc->inode->ino;
+        local->cont.symlink.linkpath = strdup (linkpath);
+        if (!local->cont.symlink.linkpath) {
+                /*
+                  do not wind a NULL link path to the children;
+                  NOTE(review): assumes AFR_STACK_DESTROY releases the
+                  local attached above - confirm against afr.h
+                */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        local->transaction.fop = afr_symlink_wind;
+        local->transaction.done = afr_symlink_done;
+        local->transaction.unwind = afr_symlink_unwind;
+
+        afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.basename = AFR_BASENAME (loc->path);
+        local->transaction.pending = AFR_ENTRY_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ rename */
+
+/**
+ * Unwind the caller's frame for a rename transaction.
+ *
+ * The main frame pointer is read and cleared under the frame lock; if it
+ * was already cleared, nothing is unwound. Before unwinding, the reply's
+ * st_ino is overwritten with the inode number saved in cont.rename.ino.
+ */
+int
+afr_rename_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local  = frame->local;
+        call_frame_t *caller = NULL;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!caller)
+                return 0;
+
+        local->cont.rename.buf.st_ino = local->cont.rename.ino;
+
+        AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno,
+                          &local->cont.rename.buf);
+
+        return 0;
+}
+
+
+/**
+ * Per-child reply handler for the rename fop.
+ *
+ * Only the first successful reply is recorded (op_ret and, when buf is
+ * non-NULL, the stat buffer with st_ino rewritten by afr_itransform()).
+ * op_errno is stored for every reply. When afr_frame_return() reports
+ * zero, the transaction's unwind and resume hooks are invoked.
+ */
+int
+afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+        int            child   = (long) cookie;
+        int            pending = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child);
+
+                if ((local->success_count == 0) && (op_ret != -1)) {
+                        local->op_ret = op_ret;
+
+                        if (buf != NULL) {
+                                local->cont.rename.buf = *buf;
+                                local->cont.rename.buf.st_ino =
+                                        afr_itransform (buf->st_ino,
+                                                        priv->child_count,
+                                                        child);
+                        }
+
+                        local->success_count++;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        local->transaction.unwind (frame, this);
+        local->transaction.resume (frame, this);
+
+        return 0;
+}
+
+
+/**
+ * Wind the rename fop (local->loc -> local->newloc) to every child marked
+ * up in local->child_up.
+ *
+ * If no child is up, the transaction is resumed immediately.
+ */
+int32_t
+afr_rename_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = 0;
+        int            idx       = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_rename_wind_cbk,
+                                   (void *) (long) idx,
+                                   priv->children[idx],
+                                   priv->children[idx]->fops->rename,
+                                   &local->loc,
+                                   &local->newloc);
+
+                /* leave the loop once every counted child has been wound */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * Completion hook of the rename transaction: run the unwind hook, then
+ * destroy the transaction frame.
+ */
+int
+afr_rename_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = NULL;
+
+        local = frame->local;
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+        return 0;
+}
+
+
+/**
+ * Entry point of the rename fop.
+ *
+ * Copies the caller's frame into a transaction frame, attaches a freshly
+ * initialised afr_local_t to it, records both locations and their parent
+ * locations, and starts an AFR_ENTRY_RENAME_TRANSACTION. On any setup
+ * failure the transaction frame (if created) is destroyed and the
+ * caller's frame is unwound with op_ret == -1.
+ *
+ * @param frame   caller's frame; unwound later through
+ *                local->transaction.main_frame
+ * @param this    the AFR translator instance
+ * @param oldloc  current location of the entry
+ * @param newloc  location the entry is renamed to
+ * @return always 0; the result is delivered through the unwind
+ */
+int
+afr_rename (call_frame_t *frame, xlator_t *this,
+            loc_t *oldloc, loc_t *newloc)
+{
+        afr_private_t * priv = NULL;
+        afr_local_t * local = NULL;
+        call_frame_t * transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* report the failure instead of unwinding with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        loc_copy (&local->loc, oldloc);
+        loc_copy (&local->newloc, newloc);
+
+        /* remembered so the unwind can report the original inode number */
+        local->cont.rename.ino = oldloc->inode->ino;
+
+        local->transaction.fop = afr_rename_wind;
+        local->transaction.done = afr_rename_done;
+        local->transaction.unwind = afr_rename_unwind;
+
+        afr_build_parent_loc (&local->transaction.parent_loc, oldloc);
+        afr_build_parent_loc (&local->transaction.new_parent_loc, newloc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.basename = AFR_BASENAME (oldloc->path);
+        local->transaction.new_basename = AFR_BASENAME (newloc->path);
+        local->transaction.pending = AFR_ENTRY_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ unlink */
+
+/**
+ * Unwind the caller's frame for an unlink transaction.
+ *
+ * The main frame pointer is read and cleared under the frame lock; if it
+ * was already cleared, nothing is unwound.
+ */
+int
+afr_unlink_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local  = frame->local;
+        call_frame_t *caller = NULL;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!caller)
+                return 0;
+
+        AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno);
+
+        return 0;
+}
+
+
+/**
+ * Per-child reply handler for the unlink fop.
+ *
+ * The first successful reply sets local->op_ret; every success bumps
+ * success_count and every reply stores op_errno. When afr_frame_return()
+ * reports zero, the transaction's unwind and resume hooks are invoked.
+ *
+ * NOTE(review): an earlier revision computed a "need_unwind" flag from
+ * priv->wait_count but never acted on it; the dead flag was removed. If
+ * early unwinding after wait_count successes is wanted, it has to be
+ * implemented explicitly.
+ */
+int
+afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t * local = NULL;
+
+        int call_count = -1;
+        int child_index = (long) cookie;
+
+        local = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        if (local->success_count == 0) {
+                                local->op_ret = op_ret;
+                        }
+                        local->success_count++;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                local->transaction.unwind (frame, this);
+
+                local->transaction.resume (frame, this);
+        }
+
+        return 0;
+}
+
+
+/**
+ * Wind the unlink fop on local->loc to every child marked up in
+ * local->child_up.
+ *
+ * If no child is up, the transaction is resumed immediately.
+ */
+int32_t
+afr_unlink_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = 0;
+        int            idx       = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk,
+                                   (void *) (long) idx,
+                                   priv->children[idx],
+                                   priv->children[idx]->fops->unlink,
+                                   &local->loc);
+
+                /* leave the loop once every counted child has been wound */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * Completion hook of the unlink transaction: run the unwind hook, then
+ * destroy the transaction frame.
+ */
+int32_t
+afr_unlink_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = NULL;
+
+        local = frame->local;
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+        return 0;
+}
+
+
+/**
+ * Entry point of the unlink fop.
+ *
+ * Copies the caller's frame into a transaction frame, attaches a freshly
+ * initialised afr_local_t to it, records the location and its parent, and
+ * starts an AFR_ENTRY_TRANSACTION. On any setup failure the transaction
+ * frame (if created) is destroyed and the caller's frame is unwound with
+ * op_ret == -1.
+ *
+ * @param frame  caller's frame; unwound later through
+ *               local->transaction.main_frame
+ * @param this   the AFR translator instance
+ * @param loc    location of the entry to remove
+ * @return always 0; the result is delivered through the unwind
+ */
+int32_t
+afr_unlink (call_frame_t *frame, xlator_t *this,
+            loc_t *loc)
+{
+        afr_private_t * priv = NULL;
+        afr_local_t * local = NULL;
+        call_frame_t * transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* report the failure instead of unwinding with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        loc_copy (&local->loc, loc);
+
+        local->transaction.fop = afr_unlink_wind;
+        local->transaction.done = afr_unlink_done;
+        local->transaction.unwind = afr_unlink_unwind;
+
+        afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.basename = AFR_BASENAME (loc->path);
+        local->transaction.pending = AFR_ENTRY_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ rmdir */
+
+
+
+/**
+ * Unwind the caller's frame for an rmdir transaction.
+ *
+ * The main frame pointer is read and cleared under the frame lock; if it
+ * was already cleared, nothing is unwound.
+ */
+int
+afr_rmdir_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local  = frame->local;
+        call_frame_t *caller = NULL;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!caller)
+                return 0;
+
+        AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno);
+
+        return 0;
+}
+
+
+/**
+ * Per-child reply handler for the rmdir fop.
+ *
+ * The first successful reply sets local->op_ret; every success bumps
+ * success_count and every reply stores op_errno. When afr_frame_return()
+ * reports zero, the transaction's unwind and resume hooks are invoked.
+ *
+ * NOTE(review): an earlier revision computed a "need_unwind" flag from
+ * priv->wait_count but never acted on it; the dead flag was removed. If
+ * early unwinding after wait_count successes is wanted, it has to be
+ * implemented explicitly.
+ */
+int
+afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t * local = NULL;
+
+        int call_count = -1;
+        int child_index = (long) cookie;
+
+        local = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        if (local->success_count == 0) {
+                                local->op_ret = op_ret;
+                        }
+                        local->success_count++;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                local->transaction.unwind (frame, this);
+
+                local->transaction.resume (frame, this);
+        }
+
+        return 0;
+}
+
+
+/**
+ * Wind the rmdir fop on local->loc to every child marked up in
+ * local->child_up.
+ *
+ * If no child is up, the transaction is resumed immediately.
+ */
+int
+afr_rmdir_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = 0;
+        int            idx       = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk,
+                                   (void *) (long) idx,
+                                   priv->children[idx],
+                                   priv->children[idx]->fops->rmdir,
+                                   &local->loc);
+
+                /* leave the loop once every counted child has been wound */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * Completion hook of the rmdir transaction: run the unwind hook, then
+ * destroy the transaction frame.
+ */
+int
+afr_rmdir_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = NULL;
+
+        local = frame->local;
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+        return 0;
+}
+
+
+/**
+ * Entry point of the rmdir fop.
+ *
+ * Copies the caller's frame into a transaction frame, attaches a freshly
+ * initialised afr_local_t to it, records the location and its parent, and
+ * starts an AFR_ENTRY_TRANSACTION. On any setup failure the transaction
+ * frame (if created) is destroyed and the caller's frame is unwound with
+ * op_ret == -1.
+ *
+ * @param frame  caller's frame; unwound later through
+ *               local->transaction.main_frame
+ * @param this   the AFR translator instance
+ * @param loc    location of the directory to remove
+ * @return always 0; the result is delivered through the unwind
+ */
+int
+afr_rmdir (call_frame_t *frame, xlator_t *this,
+           loc_t *loc)
+{
+        afr_private_t * priv = NULL;
+        afr_local_t * local = NULL;
+        call_frame_t * transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* report the failure instead of unwinding with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        loc_copy (&local->loc, loc);
+
+        local->transaction.fop = afr_rmdir_wind;
+        local->transaction.done = afr_rmdir_done;
+        local->transaction.unwind = afr_rmdir_unwind;
+
+        afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.basename = AFR_BASENAME (loc->path);
+        local->transaction.pending = AFR_ENTRY_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ setdents */
+
+/**
+ * Per-child reply handler for the setdents fop.
+ *
+ * Only the first successful reply is recorded in local->op_ret; op_errno
+ * is stored for every reply. When afr_frame_return() reports zero, the
+ * transaction is resumed. Unlike the other entry fops in this file, no
+ * unwind hook is called here.
+ */
+int32_t
+afr_setdents_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t * local = NULL;
+
+        int call_count = -1;
+        int child_index = (long) cookie;
+
+        local = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if ((op_ret != -1) && (local->success_count == 0)) {
+                        local->op_ret = op_ret;
+                        local->success_count++;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                local->transaction.resume (frame, this);
+        }
+
+        return 0;
+}
+
+
+/**
+ * Wind the setdents fop (fd, flags, entries, count taken from local) to
+ * every child marked up in local->child_up.
+ *
+ * If no child is up, the transaction is resumed immediately.
+ */
+int32_t
+afr_setdents_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = 0;
+        int            idx       = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_setdents_wind_cbk,
+                                   (void *) (long) idx,
+                                   priv->children[idx],
+                                   priv->children[idx]->fops->setdents,
+                                   local->fd, local->cont.setdents.flags,
+                                   local->cont.setdents.entries,
+                                   local->cont.setdents.count);
+
+                /* leave the loop once every counted child has been wound */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * Completion hook of the setdents transaction: unwind the frame with the
+ * op_ret / op_errno recorded in its local.
+ */
+int32_t
+afr_setdents_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = NULL;
+
+        local = frame->local;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+        return 0;
+}
+
+
+/**
+ * Entry point of the setdents fop.
+ *
+ * Unlike the other entry fops in this file, the transaction runs on the
+ * caller's own frame: a freshly initialised afr_local_t is attached to
+ * it, the arguments are stored, and an AFR_ENTRY_TRANSACTION is started.
+ * A setup failure unwinds the frame with op_ret == -1.
+ *
+ * @return always 0; the result is delivered through the unwind
+ */
+int32_t
+afr_setdents (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int ret      = -1;
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        frame->local = local;
+
+        /* arguments of the fop, replayed on every child by the wind hook */
+        local->fd                    = fd_ref (fd);
+        local->cont.setdents.flags   = flags;
+        local->cont.setdents.entries = entries;
+        local->cont.setdents.count   = count;
+
+        local->transaction.fop      = afr_setdents_wind;
+        local->transaction.done     = afr_setdents_done;
+        local->transaction.basename = NULL;
+        local->transaction.pending  = AFR_ENTRY_PENDING;
+
+        afr_transaction (frame, this, AFR_ENTRY_TRANSACTION);
+
+        return 0;
+
+out:
+        /* only reached through a failed validation or allocation */
+        AFR_STACK_UNWIND (frame, op_ret, op_errno);
+
+        return 0;
+}
+
+/* }}} */
diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h
new file mode 100644
index 000000000..e6e8a5e79
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-dir-write.h
@@ -0,0 +1,59 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __DIR_WRITE_H__
+#define __DIR_WRITE_H__
+
+/*
+  Directory-write fops of the AFR translator. Each takes the caller's
+  frame and the translator instance as its first two arguments.
+*/
+
+/* create a file at loc and open it on fd */
+int32_t
+afr_create (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, int32_t flags, mode_t mode, fd_t *fd);
+
+/* create a filesystem node at loc */
+int32_t
+afr_mknod (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, mode_t mode, dev_t dev);
+
+/* create a directory at loc */
+int32_t
+afr_mkdir (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, mode_t mode);
+
+/* remove the entry at loc */
+int32_t
+afr_unlink (call_frame_t *frame, xlator_t *this,
+            loc_t *loc);
+
+/* remove the directory at loc */
+int32_t
+afr_rmdir (call_frame_t *frame, xlator_t *this,
+           loc_t *loc);
+
+/* create a hard link newloc referring to oldloc */
+int32_t
+afr_link (call_frame_t *frame, xlator_t *this,
+          loc_t *oldloc, loc_t *newloc);
+
+/* rename oldloc to newloc */
+int32_t
+afr_rename (call_frame_t *frame, xlator_t *this,
+            loc_t *oldloc, loc_t *newloc);
+
+/* create a symlink at loc whose contents are linkpath */
+int32_t
+afr_symlink (call_frame_t *frame, xlator_t *this,
+             const char *linkpath, loc_t *loc);
+
+/* set count directory entries on the directory open on fd */
+int32_t
+afr_setdents (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count);
+
+#endif /* __DIR_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
new file mode 100644
index 000000000..a6c99ec05
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -0,0 +1,721 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+
+#include "afr.h"
+
+
+/**
+ * Common algorithm for inode read calls:
+ *
+ * - Try the fop on the first child that is up
+ * - if we have failed due to ENOTCONN:
+ * try the next child
+ *
+ * Applicable to: access, stat, fstat, readlink, getxattr
+ */
+
+/* {{{ access */
+
+/**
+ * Reply handler for the access fop.
+ *
+ * On failure, the next child after cont.access.last_tried is tried unless
+ * all_tried() says every child has been attempted. Otherwise the reply is
+ * unwound to the caller unchanged.
+ */
+int32_t
+afr_access_cbk (call_frame_t *frame, void *cookie,
+                xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+        afr_private_t *priv     = this->private;
+        xlator_t     **children = priv->children;
+        afr_local_t   *local    = frame->local;
+        int            next     = -1;
+
+        if ((op_ret == -1)
+            && !all_tried (local->cont.access.last_tried,
+                           priv->child_count)) {
+                next = ++local->cont.access.last_tried;
+
+                STACK_WIND_COOKIE (frame, afr_access_cbk,
+                                   (void *) (long) next,
+                                   children[next],
+                                   children[next]->fops->access,
+                                   &local->loc, local->cont.access.mask);
+
+                return 0;
+        }
+
+        /* success, or no child left to try */
+        AFR_STACK_UNWIND (frame, op_ret, op_errno);
+
+        return 0;
+}
+
+
+/**
+ * Entry point of the access fop.
+ *
+ * Winds access to the first child that is up and records, in the frame's
+ * local, what afr_access_cbk needs to retry on the following children
+ * (last tried index, a copy of loc, the mask). If no child is up, the
+ * frame is unwound with ENOTCONN.
+ *
+ * @return always 0; the result is delivered through the unwind
+ */
+int32_t
+afr_access (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, int32_t mask)
+{
+        afr_private_t * priv = NULL;
+        xlator_t ** children = NULL;
+        int call_child = 0;
+        afr_local_t *local = NULL;
+
+        int32_t op_ret = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        /*
+          afr_access_cbk reads its retry state from frame->local; without
+          this assignment it would dereference NULL on the first failure
+        */
+        frame->local = local;
+
+        call_child = afr_first_up_child (priv);
+        if (call_child == -1) {
+                op_errno = ENOTCONN;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no child is up :(");
+                goto out;
+        }
+
+        local->cont.access.last_tried = call_child;
+        loc_copy (&local->loc, loc);
+        local->cont.access.mask = mask;
+
+        STACK_WIND_COOKIE (frame, afr_access_cbk,
+                           (void *) (long) call_child,
+                           children[call_child], children[call_child]->fops->access,
+                           loc, mask);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ stat */
+
+/**
+ * Reply handler for the stat fop.
+ *
+ * The cookie carries the index of the child the first stat was sent to.
+ * On failure the remaining children are tried in increasing index order,
+ * skipping that first child. When unwinding a successful reply, st_ino is
+ * replaced with the inode number saved in cont.stat.ino.
+ */
+int32_t
+afr_stat_cbk (call_frame_t *frame, void *cookie,
+              xlator_t *this, int32_t op_ret, int32_t op_errno,
+              struct stat *buf)
+{
+        afr_private_t *priv        = this->private;
+        xlator_t     **children    = priv->children;
+        afr_local_t   *local       = frame->local;
+        int            first_child = (long) cookie;
+        int            next        = -1;
+
+        if (op_ret == -1) {
+                do {
+                        if (all_tried (local->cont.stat.last_tried,
+                                       priv->child_count))
+                                goto out;
+
+                        next = ++local->cont.stat.last_tried;
+
+                        /* the first child has already answered; skip it */
+                } while (next == first_child);
+
+                STACK_WIND_COOKIE (frame, afr_stat_cbk,
+                                   (void *) (long) first_child,
+                                   children[next],
+                                   children[next]->fops->stat,
+                                   &local->loc);
+
+                return 0;
+        }
+
+out:
+        if (op_ret != -1)
+                buf->st_ino = local->cont.stat.ino;
+
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
+
+        return 0;
+}
+
+
+/**
+ * Entry point of the stat fop.
+ *
+ * The first attempt goes to the child selected by afr_deitransform() from
+ * the inode number; afr_stat_cbk falls back to the other children.
+ *
+ * @return always 0; the result is delivered through the unwind
+ */
+int32_t
+afr_stat (call_frame_t *frame, xlator_t *this,
+          loc_t *loc)
+{
+        afr_private_t *priv       = NULL;
+        afr_local_t   *local      = NULL;
+        xlator_t     **children   = NULL;
+        int            call_child = 0;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        frame->local = local;
+
+        loc_copy (&local->loc, loc);
+        call_child = afr_deitransform (loc->inode->ino, priv->child_count);
+
+        /*
+          a failure on the deitransformed child makes the callback walk
+          all children from index 0, hence the -1 starting point
+        */
+        local->cont.stat.ino        = loc->inode->ino;
+        local->cont.stat.last_tried = -1;
+
+        STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child,
+                           children[call_child],
+                           children[call_child]->fops->stat,
+                           loc);
+
+        return 0;
+
+out:
+        /* only reached through a failed validation or allocation */
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ fstat */
+
+/**
+ * Reply handler for the fstat fop.
+ *
+ * The cookie carries the index of the child the first fstat was sent to.
+ * On failure the remaining children are tried in increasing index order,
+ * skipping that first child. When unwinding a successful reply, st_ino is
+ * replaced with the inode number saved in cont.fstat.ino.
+ */
+int32_t
+afr_fstat_cbk (call_frame_t *frame, void *cookie,
+               xlator_t *this, int32_t op_ret, int32_t op_errno,
+               struct stat *buf)
+{
+        afr_private_t *priv        = this->private;
+        xlator_t     **children    = priv->children;
+        afr_local_t   *local       = frame->local;
+        int            first_child = (long) cookie;
+        int            next        = -1;
+
+        if (op_ret == -1) {
+                do {
+                        if (all_tried (local->cont.fstat.last_tried,
+                                       priv->child_count))
+                                goto out;
+
+                        next = ++local->cont.fstat.last_tried;
+
+                        /*
+                          the deitransformed child was the one that just
+                          failed first, so it is never retried
+                        */
+                } while (next == first_child);
+
+                STACK_WIND_COOKIE (frame, afr_fstat_cbk,
+                                   (void *) (long) first_child,
+                                   children[next],
+                                   children[next]->fops->fstat,
+                                   local->fd);
+
+                return 0;
+        }
+
+out:
+        if (op_ret != -1)
+                buf->st_ino = local->cont.fstat.ino;
+
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
+
+        return 0;
+}
+
+
+/**
+ * Entry point of the fstat fop.
+ *
+ * The first attempt goes to the child selected by afr_deitransform() from
+ * the fd's inode number; afr_fstat_cbk falls back to the other children.
+ *
+ * @return always 0; the result is delivered through the unwind
+ */
+int32_t
+afr_fstat (call_frame_t *frame, xlator_t *this,
+           fd_t *fd)
+{
+        afr_private_t *priv       = NULL;
+        afr_local_t   *local      = NULL;
+        xlator_t     **children   = NULL;
+        int            call_child = 0;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        frame->local = local;
+
+        VALIDATE_OR_GOTO (fd->inode, out);
+
+        call_child = afr_deitransform (fd->inode->ino, priv->child_count);
+
+        /*
+          a failure on the deitransformed child makes the callback walk
+          all children from index 0, hence the -1 starting point
+        */
+        local->cont.fstat.ino        = fd->inode->ino;
+        local->cont.fstat.last_tried = -1;
+        local->fd                    = fd_ref (fd);
+
+        STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child,
+                           children[call_child],
+                           children[call_child]->fops->fstat,
+                           fd);
+
+        return 0;
+
+out:
+        /* only reached through a failed validation or allocation */
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ readlink */
+
+/**
+ * Reply handler for the readlink fop.
+ *
+ * On failure, the next child after cont.readlink.last_tried is tried
+ * unless all_tried() says every child has been attempted. Otherwise the
+ * reply is unwound to the caller unchanged.
+ */
+int32_t
+afr_readlink_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno,
+                  const char *buf)
+{
+        afr_private_t *priv     = this->private;
+        xlator_t     **children = priv->children;
+        afr_local_t   *local    = frame->local;
+        int            next     = -1;
+
+        if ((op_ret == -1)
+            && !all_tried (local->cont.readlink.last_tried,
+                           priv->child_count)) {
+                next = ++local->cont.readlink.last_tried;
+
+                STACK_WIND_COOKIE (frame, afr_readlink_cbk,
+                                   (void *) (long) next,
+                                   children[next],
+                                   children[next]->fops->readlink,
+                                   &local->loc,
+                                   local->cont.readlink.size);
+
+                return 0;
+        }
+
+        /* success, or no child left to try */
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
+
+        return 0;
+}
+
+
+/**
+ * Entry point of the readlink fop.
+ *
+ * Winds readlink to the first child that is up and stores the retry state
+ * (last tried index, a copy of loc, size) for afr_readlink_cbk. If no
+ * child is up, the frame is unwound with ENOTCONN.
+ *
+ * @return always 0; the result is delivered through the unwind
+ */
+int32_t
+afr_readlink (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, size_t size)
+{
+        afr_private_t *priv       = NULL;
+        afr_local_t   *local      = NULL;
+        xlator_t     **children   = NULL;
+        int            call_child = 0;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        frame->local = local;
+
+        call_child = afr_first_up_child (priv);
+        if (call_child == -1) {
+                op_errno = ENOTCONN;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no child is up :(");
+                goto out;
+        }
+
+        /* state the callback needs to retry on the following children */
+        local->cont.readlink.last_tried = call_child;
+        local->cont.readlink.size       = size;
+        loc_copy (&local->loc, loc);
+
+        STACK_WIND_COOKIE (frame, afr_readlink_cbk,
+                           (void *) (long) call_child,
+                           children[call_child],
+                           children[call_child]->fops->readlink,
+                           loc, size);
+
+        return 0;
+
+out:
+        /* reached only on a validation, allocation or no-child failure */
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ getxattr */
+
+/**
+ * Reply handler for the getxattr fop.
+ *
+ * On failure, the next child after cont.getxattr.last_tried is tried
+ * unless all_tried() says every child has been attempted. Otherwise the
+ * reply is unwound to the caller unchanged.
+ */
+int32_t
+afr_getxattr_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno,
+                  dict_t *dict)
+{
+        afr_private_t *priv     = this->private;
+        xlator_t     **children = priv->children;
+        afr_local_t   *local    = frame->local;
+        int            next     = -1;
+
+        if ((op_ret == -1)
+            && !all_tried (local->cont.getxattr.last_tried,
+                           priv->child_count)) {
+                next = ++local->cont.getxattr.last_tried;
+
+                STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
+                                   (void *) (long) next,
+                                   children[next],
+                                   children[next]->fops->getxattr,
+                                   &local->loc,
+                                   local->cont.getxattr.name);
+
+                return 0;
+        }
+
+        /* success, or no child left to try */
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, dict);
+
+        return 0;
+}
+
+
+/**
+ * Entry point of the getxattr fop.
+ *
+ * Winds getxattr to the first child that is up and stores the retry state
+ * (last tried index, a copy of loc, a copy of name) for afr_getxattr_cbk.
+ * If no child is up, the frame is unwound with ENOTCONN; if the name
+ * cannot be duplicated, with ENOMEM.
+ *
+ * @param name  attribute name; may be NULL, in which case no copy is made
+ * @return always 0; the result is delivered through the unwind
+ */
+int32_t
+afr_getxattr (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, const char *name)
+{
+        afr_private_t * priv = NULL;
+        xlator_t ** children = NULL;
+        int call_child = 0;
+        afr_local_t * local = NULL;
+
+        int32_t op_ret = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+        frame->local = local;
+
+        call_child = afr_first_up_child (priv);
+        if (call_child == -1) {
+                op_errno = ENOTCONN;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no child is up :(");
+                goto out;
+        }
+
+        local->cont.getxattr.last_tried = call_child;
+        loc_copy (&local->loc, loc);
+        if (name) {
+                local->cont.getxattr.name = strdup (name);
+                if (!local->cont.getxattr.name) {
+                        /*
+                          a retry from the callback would otherwise pass a
+                          NULL name in place of the requested one
+                        */
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "out of memory :(");
+                        op_errno = ENOMEM;
+                        goto out;
+                }
+        }
+
+        STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
+                           (void *) (long) call_child,
+                           children[call_child], children[call_child]->fops->getxattr,
+                           loc, name);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ readv */
+
+/**
+ * read algorithm:
+ *
+ * if the user has specified a read subvolume, use it
+ * otherwise -
+ * use the inode number to hash it to one of the subvolumes, and
+ * read from there (to balance read load)
+ *
+ * if any of the above read's fail, try the children in sequence
+ * beginning at the beginning
+ */
+
+/**
+ * Reply handler for the readv fop.
+ *
+ * On failure the children are tried in increasing index order starting
+ * after cont.readv.last_tried, skipping priv->read_child. When no child
+ * is left, or on success, the reply is unwound to the caller unchanged.
+ */
+int32_t
+afr_readv_cbk (call_frame_t *frame, void *cookie,
+               xlator_t *this, int32_t op_ret, int32_t op_errno,
+               struct iovec *vector, int32_t count, struct stat *buf)
+{
+        afr_private_t *priv     = NULL;
+        afr_local_t   *local    = NULL;
+        xlator_t     **children = NULL;
+        int            next     = -1;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+        local    = frame->local;
+
+        if (op_ret == -1) {
+                do {
+                        if (all_tried (local->cont.readv.last_tried,
+                                       priv->child_count))
+                                goto out;
+
+                        next = ++local->cont.readv.last_tried;
+
+                        /* never retry on the configured read child */
+                } while (next == priv->read_child);
+
+                STACK_WIND_COOKIE (frame, afr_readv_cbk,
+                                   (void *) (long) next,
+                                   children[next],
+                                   children[next]->fops->readv,
+                                   local->fd, local->cont.readv.size,
+                                   local->cont.readv.offset);
+
+                return 0;
+        }
+
+out:
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, vector, count, buf);
+
+        return 0;
+}
+
+
+/**
+ * Entry point of the readv fop.
+ *
+ * Reads from priv->read_child when one is configured (not -1), otherwise
+ * from the first child that is up. The retry state for afr_readv_cbk is
+ * stored in the frame's local. If no child is up, the frame is unwound
+ * with ENOTCONN.
+ *
+ * @return always 0; the result is delivered through the unwind
+ */
+int32_t
+afr_readv (call_frame_t *frame, xlator_t *this,
+           fd_t *fd, size_t size, off_t offset)
+{
+        afr_private_t *priv       = NULL;
+        afr_local_t   *local      = NULL;
+        xlator_t     **children   = NULL;
+        int            call_child = 0;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        priv     = this->private;
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        frame->local = local;
+
+        if (priv->read_child == -1) {
+                call_child = afr_first_up_child (priv);
+                if (call_child == -1) {
+                        op_errno = ENOTCONN;
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "no child is up :(");
+                        goto out;
+                }
+
+                local->cont.readv.last_tried = call_child;
+        } else {
+                call_child = priv->read_child;
+
+                /*
+                  should the read child fail, the callback walks all
+                  children from index 0, hence the -1 starting point
+                */
+                local->cont.readv.last_tried = -1;
+        }
+
+        local->fd                = fd_ref (fd);
+        local->cont.readv.size   = size;
+        local->cont.readv.offset = offset;
+
+        STACK_WIND_COOKIE (frame, afr_readv_cbk,
+                           (void *) (long) call_child,
+                           children[call_child],
+                           children[call_child]->fops->readv,
+                           fd, size, offset);
+
+        return 0;
+
+out:
+        /* reached only on a validation, allocation or no-child failure */
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, 0, NULL);
+
+        return 0;
+}
+
+/* }}} */
diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h
new file mode 100644
index 000000000..6b3bd2da8
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-inode-read.h
@@ -0,0 +1,47 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __INODE_READ_H__
+#define __INODE_READ_H__
+
+/*
+ * Prototypes for AFR's read-side inode operations.  Every entry point
+ * takes the calling frame and this translator, followed by the
+ * operation-specific arguments.
+ */
+
+/* access check on loc with the given mask */
+int32_t
+afr_access (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t mask);
+
+/* stat by path */
+int32_t
+afr_stat (call_frame_t *frame, xlator_t *this,
+ loc_t *loc);
+
+/* stat by open file descriptor */
+int32_t
+afr_fstat (call_frame_t *frame, xlator_t *this,
+ fd_t *fd);
+
+/* read a symlink target; size is presumably the caller's buffer size */
+int32_t
+afr_readlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, size_t size);
+
+/* read size bytes at offset from fd */
+int32_t
+afr_readv (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset);
+
+/* fetch the extended attribute called name from loc */
+int32_t
+afr_getxattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name);
+
+#endif /* __INODE_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
new file mode 100644
index 000000000..267350b2c
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -0,0 +1,2024 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <errno.h>
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+
+#include "afr.h"
+#include "afr-transaction.h"
+
+
+/* {{{ chmod */
+
+
+/*
+ * Reply to the caller of chmod, at most once.
+ *
+ * The caller's frame (transaction.main_frame) is taken and cleared under
+ * frame->lock, so any later call finds NULL and does nothing.  The inode
+ * number saved at entry is put into the stat buffer before unwinding.
+ *
+ * Always returns 0.
+ */
+int
+afr_chmod_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local  = frame->local;
+        afr_private_t *priv   = this->private;  /* not used below */
+        call_frame_t  *caller = NULL;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (caller == NULL)
+                return 0;
+
+        local->cont.chmod.buf.st_ino = local->cont.chmod.ino;
+        AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno,
+                          &local->cont.chmod.buf);
+
+        return 0;
+}
+
+
+/*
+ * Callback for each chmod wound by afr_chmod_wind.
+ *
+ * cookie carries the index of the child that replied.  The first
+ * successful reply supplies local->op_ret and the stat buffer; once
+ * success_count equals priv->wait_count the caller is unwound early.
+ * When afr_frame_return() reports 0 the transaction is resumed.
+ *
+ * Always returns 0.
+ */
+int
+afr_chmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = (long) cookie;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ /* only the first success provides the reply data */
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.chmod.buf = *buf;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+
+ /* overwritten by every reply, successful or not */
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind)
+ afr_chmod_unwind (frame, this);
+
+ /* presumably returns the number of replies still outstanding */
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+/*
+ * Send chmod to every child marked up in local->child_up.
+ *
+ * If no child is up the transaction is resumed straight away.  Otherwise
+ * local->call_count is set to the number of calls about to be wound and
+ * each call passes its child index as the cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_chmod_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+        int            pending = -1;
+        int            child   = 0;
+
+        pending = afr_up_children_count (priv->child_count, local->child_up);
+        if (pending == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = pending;
+
+        /* `pending` is tested first so nothing is read after the last wind */
+        for (child = 0; pending != 0 && child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_chmod_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->chmod,
+                                   &local->loc,
+                                   local->cont.chmod.mode);
+                pending--;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Last step of a chmod transaction: call the transaction's unwind hook
+ * (so the caller is answered if that has not happened yet) and then
+ * destroy the transaction frame.
+ *
+ * Always returns 0.
+ */
+int
+afr_chmod_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/*
+ * afr_chmod - start a metadata transaction that applies `mode` to `loc`.
+ *
+ * A copy of the caller's frame drives the transaction; the caller's own
+ * frame is kept in transaction.main_frame and is answered through
+ * afr_chmod_unwind.  If setup fails, the copied frame (if any) is
+ * destroyed and the caller is unwound here with op_ret -1.
+ *
+ * Always returns 0; the result is delivered through the unwind.
+ */
+int32_t
+afr_chmod (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, mode_t mode)
+{
+        afr_private_t * priv  = NULL;
+        afr_local_t   * local = NULL;
+        call_frame_t  * transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        /* loc->inode->ino is read below, so both must be present */
+        VALIDATE_OR_GOTO (loc, out);
+        VALIDATE_OR_GOTO (loc->inode, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                /* report the failure instead of unwinding with errno 0 */
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        /*
+         * NOTE(review): if AFR_LOCAL_INIT fails, `local` is not yet
+         * attached to transaction_frame, so it looks like it is never
+         * released -- confirm against AFR_STACK_DESTROY.
+         */
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        local->cont.chmod.mode = mode;
+        local->cont.chmod.ino  = loc->inode->ino;
+
+        local->transaction.fop    = afr_chmod_wind;
+        local->transaction.done   = afr_chmod_done;
+        local->transaction.unwind = afr_chmod_unwind;
+
+        loc_copy (&local->loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.start      = 0;
+        local->transaction.len        = 0;
+        local->transaction.pending    = AFR_METADATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+
+/* {{{ fchmod */
+
+/*
+ * Reply to the caller of fchmod, at most once.
+ *
+ * The caller's frame (transaction.main_frame) is taken and cleared under
+ * frame->lock, so any later call finds NULL and does nothing.  The inode
+ * number saved at entry is put into the stat buffer before unwinding.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchmod_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local  = frame->local;
+        afr_private_t *priv   = this->private;  /* not used below */
+        call_frame_t  *caller = NULL;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (caller == NULL)
+                return 0;
+
+        local->cont.fchmod.buf.st_ino = local->cont.fchmod.ino;
+        AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno,
+                          &local->cont.fchmod.buf);
+
+        return 0;
+}
+
+
+/*
+ * Callback for each fchmod wound by afr_fchmod_wind.
+ *
+ * cookie carries the index of the child that replied.  The first
+ * successful reply supplies local->op_ret and the stat buffer; once
+ * success_count equals priv->wait_count the caller is unwound early.
+ * When afr_frame_return() reports 0 the transaction is resumed.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = (long) cookie;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ /* only the first success provides the reply data */
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.fchmod.buf = *buf;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+
+ /* overwritten by every reply, successful or not */
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind)
+ afr_fchmod_unwind (frame, this);
+
+ /* presumably returns the number of replies still outstanding */
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+/*
+ * Send fchmod to every child marked up in local->child_up.
+ *
+ * If no child is up the transaction is resumed straight away.  Otherwise
+ * local->call_count is set to the number of calls about to be wound and
+ * each call passes its child index as the cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchmod_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+        int            pending = -1;
+        int            child   = 0;
+
+        pending = afr_up_children_count (priv->child_count, local->child_up);
+        if (pending == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = pending;
+
+        /* `pending` is tested first so nothing is read after the last wind */
+        for (child = 0; pending != 0 && child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_fchmod_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->fchmod,
+                                   local->fd,
+                                   local->cont.fchmod.mode);
+                pending--;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Last step of an fchmod transaction: call the transaction's unwind hook
+ * (so the caller is answered if that has not happened yet) and then
+ * destroy the transaction frame.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchmod_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/*
+ * afr_fchmod - start a metadata transaction that applies `mode` to the
+ * file open as `fd`.
+ *
+ * A copy of the caller's frame drives the transaction; the caller's own
+ * frame is kept in transaction.main_frame and is answered through
+ * afr_fchmod_unwind.  If setup fails, the copied frame (if any) is
+ * destroyed and the caller is unwound here with op_ret -1.
+ *
+ * Always returns 0; the result is delivered through the unwind.
+ */
+int32_t
+afr_fchmod (call_frame_t *frame, xlator_t *this,
+            fd_t *fd, mode_t mode)
+{
+        afr_private_t * priv  = NULL;
+        afr_local_t   * local = NULL;
+        call_frame_t  * transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        /* fd->inode->ino is read below, so both must be present */
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (fd->inode, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                /* report the failure instead of unwinding with errno 0 */
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        /*
+         * NOTE(review): if AFR_LOCAL_INIT fails, `local` is not yet
+         * attached to transaction_frame, so it looks like it is never
+         * released -- confirm against AFR_STACK_DESTROY.
+         */
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        local->cont.fchmod.mode = mode;
+        local->cont.fchmod.ino  = fd->inode->ino;
+
+        local->transaction.fop    = afr_fchmod_wind;
+        local->transaction.done   = afr_fchmod_done;
+        local->transaction.unwind = afr_fchmod_unwind;
+
+        local->fd = fd_ref (fd);
+
+        local->transaction.main_frame = frame;
+        local->transaction.start      = 0;
+        local->transaction.len        = 0;
+        local->transaction.pending    = AFR_METADATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ chown */
+
+/*
+ * Reply to the caller of chown, at most once.
+ *
+ * The caller's frame (transaction.main_frame) is taken and cleared under
+ * frame->lock, so any later call finds NULL and does nothing.  The inode
+ * number saved at entry is put into the stat buffer before unwinding.
+ *
+ * Always returns 0.
+ */
+int
+afr_chown_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local  = frame->local;
+        afr_private_t *priv   = this->private;  /* not used below */
+        call_frame_t  *caller = NULL;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (caller == NULL)
+                return 0;
+
+        local->cont.chown.buf.st_ino = local->cont.chown.ino;
+        AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno,
+                          &local->cont.chown.buf);
+
+        return 0;
+}
+
+
+/*
+ * Callback for each chown wound by afr_chown_wind.
+ *
+ * cookie carries the index of the child that replied.  The first
+ * successful reply supplies local->op_ret and the stat buffer; once
+ * success_count equals priv->wait_count the caller is unwound early
+ * through the transaction's unwind hook.  When afr_frame_return()
+ * reports 0 the transaction is resumed.
+ *
+ * Always returns 0.
+ */
+int
+afr_chown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = (long) cookie;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ /* only the first success provides the reply data */
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.chown.buf = *buf;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+
+ /* overwritten by every reply, successful or not */
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind) {
+ local->transaction.unwind (frame, this);
+ }
+
+ /* presumably returns the number of replies still outstanding */
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+/*
+ * Send chown to every child marked up in local->child_up.
+ *
+ * If no child is up the transaction is resumed straight away.  Otherwise
+ * local->call_count is set to the number of calls about to be wound and
+ * each call passes its child index as the cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_chown_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+        int            pending = -1;
+        int            child   = 0;
+
+        pending = afr_up_children_count (priv->child_count, local->child_up);
+        if (pending == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = pending;
+
+        /* `pending` is tested first so nothing is read after the last wind */
+        for (child = 0; pending != 0 && child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_chown_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->chown,
+                                   &local->loc, local->cont.chown.uid,
+                                   local->cont.chown.gid);
+                pending--;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Last step of a chown transaction: call the transaction's unwind hook
+ * (so the caller is answered if that has not happened yet) and then
+ * destroy the transaction frame.
+ *
+ * Always returns 0.
+ */
+int
+afr_chown_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/*
+ * afr_chown - start a metadata transaction that sets the owner (`uid`)
+ * and group (`gid`) of `loc`.
+ *
+ * A copy of the caller's frame drives the transaction; the caller's own
+ * frame is kept in transaction.main_frame and is answered through
+ * afr_chown_unwind.  If setup fails, the copied frame (if any) is
+ * destroyed and the caller is unwound here with op_ret -1.
+ *
+ * Always returns 0; the result is delivered through the unwind.
+ */
+int
+afr_chown (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, uid_t uid, gid_t gid)
+{
+        afr_private_t * priv  = NULL;
+        afr_local_t   * local = NULL;
+        call_frame_t  * transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        /* loc->inode->ino is read below, so both must be present */
+        VALIDATE_OR_GOTO (loc, out);
+        VALIDATE_OR_GOTO (loc->inode, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                /* report the failure instead of unwinding with errno 0 */
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        /*
+         * NOTE(review): if AFR_LOCAL_INIT fails, `local` is not yet
+         * attached to transaction_frame, so it looks like it is never
+         * released -- confirm against AFR_STACK_DESTROY.
+         */
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        local->cont.chown.uid = uid;
+        local->cont.chown.gid = gid;
+        local->cont.chown.ino = loc->inode->ino;
+
+        local->transaction.fop    = afr_chown_wind;
+        local->transaction.done   = afr_chown_done;
+        local->transaction.unwind = afr_chown_unwind;
+
+        loc_copy (&local->loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.start      = 0;
+        local->transaction.len        = 0;
+        local->transaction.pending    = AFR_METADATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ fchown */
+
+/*
+ * Reply to the caller of fchown, at most once.
+ *
+ * The caller's frame (transaction.main_frame) is taken and cleared under
+ * frame->lock, so any later call finds NULL and does nothing.  The inode
+ * number saved at entry is put into the stat buffer before unwinding.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchown_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local  = frame->local;
+        afr_private_t *priv   = this->private;  /* not used below */
+        call_frame_t  *caller = NULL;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (caller == NULL)
+                return 0;
+
+        local->cont.fchown.buf.st_ino = local->cont.fchown.ino;
+        AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno,
+                          &local->cont.fchown.buf);
+
+        return 0;
+}
+
+
+/*
+ * Callback for each fchown wound by afr_fchown_wind.
+ *
+ * cookie carries the index of the child that replied.  The first
+ * successful reply supplies local->op_ret and the stat buffer; once
+ * success_count equals priv->wait_count the caller is unwound early
+ * through the transaction's unwind hook.  When afr_frame_return()
+ * reports 0 the transaction is resumed.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int call_count = -1;
+ int child_index = (long) cookie;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ /* only the first success provides the reply data */
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.fchown.buf = *buf;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+
+ /* overwritten by every reply, successful or not */
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind) {
+ local->transaction.unwind (frame, this);
+ }
+
+ /* presumably returns the number of replies still outstanding */
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+/*
+ * Send fchown to every child marked up in local->child_up.
+ *
+ * If no child is up the transaction is resumed straight away.  Otherwise
+ * local->call_count is set to the number of calls about to be wound and
+ * each call passes its child index as the cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchown_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+        int            pending = -1;
+        int            child   = 0;
+
+        pending = afr_up_children_count (priv->child_count, local->child_up);
+        if (pending == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = pending;
+
+        /* `pending` is tested first so nothing is read after the last wind */
+        for (child = 0; pending != 0 && child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_fchown_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->fchown,
+                                   local->fd, local->cont.fchown.uid,
+                                   local->cont.fchown.gid);
+                pending--;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Last step of an fchown transaction: call the transaction's unwind hook
+ * (so the caller is answered if that has not happened yet) and then
+ * destroy the transaction frame.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchown_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/*
+ * afr_fchown - start a metadata transaction that sets the owner (`uid`)
+ * and group (`gid`) of the file open as `fd`.
+ *
+ * A copy of the caller's frame drives the transaction; the caller's own
+ * frame is kept in transaction.main_frame and is answered through
+ * afr_fchown_unwind.  If setup fails, the copied frame (if any) is
+ * destroyed and the caller is unwound here with op_ret -1.
+ *
+ * Always returns 0; the result is delivered through the unwind.
+ */
+int
+afr_fchown (call_frame_t *frame, xlator_t *this,
+            fd_t *fd, uid_t uid, gid_t gid)
+{
+        afr_private_t * priv  = NULL;
+        afr_local_t   * local = NULL;
+        call_frame_t  * transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        /* fd->inode->ino is read below, so both must be present */
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (fd->inode, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                /* report the failure instead of unwinding with errno 0 */
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        /*
+         * NOTE(review): if AFR_LOCAL_INIT fails, `local` is not yet
+         * attached to transaction_frame, so it looks like it is never
+         * released -- confirm against AFR_STACK_DESTROY.
+         */
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        local->cont.fchown.uid = uid;
+        local->cont.fchown.gid = gid;
+        local->cont.fchown.ino = fd->inode->ino;
+
+        local->transaction.fop    = afr_fchown_wind;
+        local->transaction.done   = afr_fchown_done;
+        local->transaction.unwind = afr_fchown_unwind;
+
+        local->fd = fd_ref (fd);
+
+        local->transaction.main_frame = frame;
+        local->transaction.start      = 0;
+        local->transaction.len        = 0;
+        local->transaction.pending    = AFR_METADATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ writev */
+
+/*
+ * Reply to the caller of writev, at most once.
+ *
+ * The caller's frame (transaction.main_frame) is taken and cleared under
+ * frame->lock, so any later call finds NULL and does nothing.  The inode
+ * number saved at entry is put into the stat buffer before unwinding.
+ *
+ * Always returns 0.
+ */
+int
+afr_writev_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local  = frame->local;
+        afr_private_t *priv   = this->private;  /* not used below */
+        call_frame_t  *caller = NULL;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (caller == NULL)
+                return 0;
+
+        local->cont.writev.buf.st_ino = local->cont.writev.ino;
+        AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno,
+                          &local->cont.writev.buf);
+
+        return 0;
+}
+
+
+/*
+ * Callback for each writev wound by afr_writev_wind.
+ *
+ * cookie carries the index of the child that replied.  The first
+ * successful reply supplies local->op_ret and the stat buffer.
+ *
+ * Unlike the other write callbacks in this file, the caller is not
+ * unwound early: need_unwind is computed but never read, and the
+ * transaction's unwind hook runs only when afr_frame_return() reports 0,
+ * immediately before the transaction is resumed.
+ * NOTE(review): presumably intentional so the write buffers stay
+ * referenced until every child has replied -- confirm.
+ *
+ * Always returns 0.
+ */
+int
+afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int child_index = (long) cookie;
+ int call_count = -1;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ /* only the first success provides the reply data */
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.writev.buf = *buf;
+ }
+ local->success_count++;
+
+ /* set here but not consulted anywhere below */
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+
+ /* overwritten by every reply, successful or not */
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ /* presumably returns the number of replies still outstanding */
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+/*
+ * Send writev to every child marked up in local->child_up.
+ *
+ * If no child is up the transaction is resumed straight away.  Otherwise
+ * local->call_count is set to the number of calls about to be wound and
+ * each call passes its child index as the cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_writev_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+        int            pending = -1;
+        int            child   = 0;
+
+        pending = afr_up_children_count (priv->child_count, local->child_up);
+        if (pending == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = pending;
+
+        /* `pending` is tested first so nothing is read after the last wind */
+        for (child = 0; pending != 0 && child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_writev_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->writev,
+                                   local->fd,
+                                   local->cont.writev.vector,
+                                   local->cont.writev.count,
+                                   local->cont.writev.offset);
+                pending--;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Last step of a writev transaction: drop the reference taken on the
+ * request's refs dict (if any), call the transaction's unwind hook, and
+ * destroy the transaction frame.
+ *
+ * Always returns 0.
+ */
+int
+afr_writev_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        if (local->cont.writev.refs != NULL)
+                dict_unref (local->cont.writev.refs);
+
+        /* cleared so the reference cannot be dropped a second time */
+        local->cont.writev.refs = NULL;
+
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/*
+ * afr_writev - start a data transaction that writes `count` iovecs from
+ * `vector` to `fd` at `offset`.
+ *
+ * The iovec array is duplicated and, when the request carries a refs
+ * dict, a reference on it is held until afr_writev_done.  For an fd
+ * opened with O_APPEND the transaction range is start 0 / len 0;
+ * otherwise it covers exactly the bytes being written.
+ *
+ * A copy of the caller's frame drives the transaction; the caller's own
+ * frame is kept in transaction.main_frame and is answered through
+ * afr_writev_unwind.  If setup fails, the copied frame (if any) is
+ * destroyed and the caller is unwound here with op_ret -1.
+ *
+ * Always returns 0; the result is delivered through the unwind.
+ */
+int
+afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+            struct iovec *vector, int32_t count, off_t offset)
+{
+        afr_private_t * priv  = NULL;
+        afr_local_t   * local = NULL;
+        call_frame_t  * transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        /* fd->inode->ino and fd->flags are read below */
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (fd->inode, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                /* report the failure instead of unwinding with errno 0 */
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        /*
+         * NOTE(review): if AFR_LOCAL_INIT fails, `local` is not yet
+         * attached to transaction_frame, so it looks like it is never
+         * released -- confirm against AFR_STACK_DESTROY.
+         */
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        local->op = GF_FOP_WRITE;
+        /* NOTE(review): the result of iov_dup is not checked -- confirm
+           whether it can return NULL */
+        local->cont.writev.vector = iov_dup (vector, count);
+        local->cont.writev.count  = count;
+        local->cont.writev.offset = offset;
+        local->cont.writev.ino    = fd->inode->ino;
+
+        if (frame->root->req_refs)
+                local->cont.writev.refs = dict_ref (frame->root->req_refs);
+
+        local->transaction.fop    = afr_writev_wind;
+        local->transaction.done   = afr_writev_done;
+        local->transaction.unwind = afr_writev_unwind;
+
+        local->fd = fd_ref (fd);
+
+        local->transaction.main_frame = frame;
+        if (fd->flags & O_APPEND) {
+                local->transaction.start = 0;
+                local->transaction.len   = 0;
+        } else {
+                local->transaction.start = offset;
+                local->transaction.len   = iov_length (vector, count);
+        }
+
+        local->transaction.pending = AFR_DATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ truncate */
+
+/*
+ * Reply to the caller of truncate, at most once.
+ *
+ * The caller's frame (transaction.main_frame) is taken and cleared under
+ * frame->lock, so any later call finds NULL and does nothing.  The inode
+ * number saved at entry is put into the stat buffer before unwinding.
+ *
+ * Always returns 0.
+ */
+int
+afr_truncate_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local  = frame->local;
+        afr_private_t *priv   = this->private;  /* not used below */
+        call_frame_t  *caller = NULL;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (caller == NULL)
+                return 0;
+
+        local->cont.truncate.buf.st_ino = local->cont.truncate.ino;
+        AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno,
+                          &local->cont.truncate.buf);
+
+        return 0;
+}
+
+
+/*
+ * Callback for each truncate wound by afr_truncate_wind.
+ *
+ * cookie carries the index of the child that replied.  The first
+ * successful reply supplies local->op_ret and the stat buffer; once
+ * success_count equals priv->wait_count the caller is unwound early
+ * through the transaction's unwind hook.  When afr_frame_return()
+ * reports 0 the transaction is resumed.
+ *
+ * Always returns 0.
+ */
+int
+afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int child_index = (long) cookie;
+ int call_count = -1;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ /* only the first success provides the reply data */
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.truncate.buf = *buf;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+ /* overwritten by every reply, successful or not */
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind)
+ local->transaction.unwind (frame, this);
+
+ /* presumably returns the number of replies still outstanding */
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+/*
+ * Send truncate to every child marked up in local->child_up.
+ *
+ * If no child is up the transaction is resumed straight away.  Otherwise
+ * local->call_count is set to the number of calls about to be wound and
+ * each call passes its child index as the cookie.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_truncate_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+        int            pending = -1;
+        int            child   = 0;
+
+        pending = afr_up_children_count (priv->child_count, local->child_up);
+        if (pending == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = pending;
+
+        /* `pending` is tested first so nothing is read after the last wind */
+        for (child = 0; pending != 0 && child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->truncate,
+                                   &local->loc,
+                                   local->cont.truncate.offset);
+                pending--;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Last step of a truncate transaction: call the transaction's unwind
+ * hook (so the caller is answered if that has not happened yet) and then
+ * destroy the transaction frame.
+ *
+ * Always returns 0.
+ */
+int
+afr_truncate_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/*
+ * afr_truncate - start a data transaction that truncates `loc` to
+ * `offset` bytes.
+ *
+ * The transaction range is start 0 / len `offset`.  A copy of the
+ * caller's frame drives the transaction; the caller's own frame is kept
+ * in transaction.main_frame and is answered through afr_truncate_unwind.
+ * If setup fails, the copied frame (if any) is destroyed and the caller
+ * is unwound here with op_ret -1.
+ *
+ * Always returns 0; the result is delivered through the unwind.
+ */
+int
+afr_truncate (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, off_t offset)
+{
+        afr_private_t * priv  = NULL;
+        afr_local_t   * local = NULL;
+        call_frame_t  * transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        /* loc->inode->ino is read below, so both must be present */
+        VALIDATE_OR_GOTO (loc, out);
+        VALIDATE_OR_GOTO (loc->inode, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                /* report the failure instead of unwinding with errno 0 */
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        /*
+         * NOTE(review): if AFR_LOCAL_INIT fails, `local` is not yet
+         * attached to transaction_frame, so it looks like it is never
+         * released -- confirm against AFR_STACK_DESTROY.
+         */
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        /* stays -1 unless some child reports success */
+        local->op_ret = -1;
+
+        local->cont.truncate.offset = offset;
+        local->cont.truncate.ino    = loc->inode->ino;
+
+        local->transaction.fop    = afr_truncate_wind;
+        local->transaction.done   = afr_truncate_done;
+        local->transaction.unwind = afr_truncate_unwind;
+
+        loc_copy (&local->loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.start      = 0;
+        local->transaction.len        = offset;
+        local->transaction.pending    = AFR_DATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ ftruncate */
+
+
+/*
+ * Reply to the caller of ftruncate, at most once.
+ *
+ * The caller's frame (transaction.main_frame) is taken and cleared under
+ * frame->lock, so any later call finds NULL and does nothing.  The inode
+ * number saved at entry is put into the stat buffer before unwinding.
+ *
+ * Always returns 0.
+ */
+int
+afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local  = frame->local;
+        afr_private_t *priv   = this->private;  /* not used below */
+        call_frame_t  *caller = NULL;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (caller == NULL)
+                return 0;
+
+        local->cont.ftruncate.buf.st_ino = local->cont.ftruncate.ino;
+        AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno,
+                          &local->cont.ftruncate.buf);
+
+        return 0;
+}
+
+
+/*
+ * Callback for each ftruncate wound by afr_ftruncate_wind.
+ *
+ * cookie carries the index of the child that replied.  The first
+ * successful reply supplies local->op_ret and the stat buffer; once
+ * success_count equals priv->wait_count the caller is unwound early
+ * through the transaction's unwind hook.  When afr_frame_return()
+ * reports 0 the transaction is resumed.
+ *
+ * Always returns 0.
+ */
+int
+afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int child_index = (long) cookie;
+ int call_count = -1;
+ int need_unwind = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (child_went_down (op_ret, op_errno))
+ afr_transaction_child_died (frame, this, child_index);
+
+ if (op_ret != -1) {
+ /* only the first success provides the reply data */
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+ local->cont.ftruncate.buf = *buf;
+ }
+ local->success_count++;
+
+ if (local->success_count == priv->wait_count) {
+ need_unwind = 1;
+ }
+ }
+ /* overwritten by every reply, successful or not */
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (need_unwind)
+ local->transaction.unwind (frame, this);
+
+ /* presumably returns the number of replies still outstanding */
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
+
+
+/*
+ * Send ftruncate to every child marked up in local->child_up.
+ *
+ * If no child is up the transaction is resumed straight away.  Otherwise
+ * local->call_count is set to the number of calls about to be wound and
+ * each call passes its child index as the cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_ftruncate_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+        int            pending = -1;
+        int            child   = 0;
+
+        pending = afr_up_children_count (priv->child_count, local->child_up);
+        if (pending == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = pending;
+
+        /* `pending` is tested first so nothing is read after the last wind */
+        for (child = 0; pending != 0 && child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->ftruncate,
+                                   local->fd, local->cont.ftruncate.offset);
+                pending--;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Last step of an ftruncate transaction: call the transaction's unwind
+ * hook (so the caller is answered if that has not happened yet) and then
+ * destroy the transaction frame.
+ *
+ * Always returns 0.
+ */
+int
+afr_ftruncate_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/*
+ * afr_ftruncate - start a data transaction that truncates the file open
+ * as `fd` to `offset` bytes.
+ *
+ * The transaction range is start 0 / len `offset`.  A copy of the
+ * caller's frame drives the transaction; the caller's own frame is kept
+ * in transaction.main_frame and is answered through
+ * afr_ftruncate_unwind.  If setup fails, the copied frame (if any) is
+ * destroyed and the caller is unwound here with op_ret -1.
+ *
+ * Always returns 0; the result is delivered through the unwind.
+ */
+int
+afr_ftruncate (call_frame_t *frame, xlator_t *this,
+               fd_t *fd, off_t offset)
+{
+        afr_private_t * priv  = NULL;
+        afr_local_t   * local = NULL;
+        call_frame_t  * transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        /* fd->inode->ino is read below, so both must be present */
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (fd->inode, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                /* report the failure instead of unwinding with errno 0 */
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        /*
+         * NOTE(review): if AFR_LOCAL_INIT fails, `local` is not yet
+         * attached to transaction_frame, so it looks like it is never
+         * released -- confirm against AFR_STACK_DESTROY.
+         */
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        local->op     = GF_FOP_FTRUNCATE;
+        /* stays -1 unless some child reports success */
+        local->op_ret = -1;
+
+        local->cont.ftruncate.offset = offset;
+        local->cont.ftruncate.ino    = fd->inode->ino;
+
+        local->transaction.fop    = afr_ftruncate_wind;
+        local->transaction.done   = afr_ftruncate_done;
+        local->transaction.unwind = afr_ftruncate_unwind;
+
+        local->fd = fd_ref (fd);
+
+        local->transaction.main_frame = frame;
+        local->transaction.start      = 0;
+        local->transaction.len        = offset;
+        local->transaction.pending    = AFR_DATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ utimens */
+
+
+/*
+ * Reply to the caller of utimens, at most once.
+ *
+ * The caller's frame (transaction.main_frame) is taken and cleared under
+ * frame->lock, so any later call finds NULL and does nothing.  The inode
+ * number saved at entry is put into the stat buffer before unwinding.
+ *
+ * Always returns 0.
+ */
+int
+afr_utimens_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local  = frame->local;
+        afr_private_t *priv   = this->private;  /* not used below */
+        call_frame_t  *caller = NULL;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (caller == NULL)
+                return 0;
+
+        local->cont.utimens.buf.st_ino = local->cont.utimens.ino;
+        AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno,
+                          &local->cont.utimens.buf);
+
+        return 0;
+}
+
+
+/*
+ * Callback for each utimens wound by afr_utimens_wind.
+ *
+ * cookie carries the index of the child that replied.  The first
+ * successful reply supplies local->op_ret and the stat buffer; once
+ * success_count equals priv->wait_count the caller is unwound early
+ * through the transaction's unwind hook.  When afr_frame_return()
+ * reports 0 the transaction is resumed.
+ *
+ * need_unwind starts at 0, as in the other write callbacks in this
+ * file.  It used to start at 1, which unwound the caller on the very
+ * first reply -- even a failed one -- instead of waiting for
+ * wait_count successes.
+ *
+ * Always returns 0.
+ */
+int
+afr_utimens_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        afr_local_t   * local = NULL;
+        afr_private_t * priv  = NULL;
+
+        int child_index = (long) cookie;
+        int call_count  = -1;
+        int need_unwind = 0;
+
+        local = frame->local;
+        priv  = this->private;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        /* only the first success provides the reply data */
+                        if (local->success_count == 0) {
+                                local->op_ret = op_ret;
+                                local->cont.utimens.buf = *buf;
+                        }
+                        local->success_count++;
+
+                        if (local->success_count == priv->wait_count) {
+                                need_unwind = 1;
+                        }
+                }
+
+                /* overwritten by every reply, successful or not */
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        if (need_unwind)
+                local->transaction.unwind (frame, this);
+
+        /* presumably returns the number of replies still outstanding */
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                local->transaction.resume (frame, this);
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_utimens_wind - send utimens to every child that is currently up.
+ *
+ * If no child is up the transaction is resumed immediately.  Otherwise
+ * local->call_count is set to the number of up children and one call,
+ * tagged with the child index as cookie, is wound to each.  Returns 0.
+ */
+int
+afr_utimens_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = -1;
+        int            child     = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_utimens_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->utimens,
+                                   &local->loc,
+                                   local->cont.utimens.tv);
+
+                /* leave as soon as the last expected call has been sent */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_utimens_done - final step of the utimens transaction.
+ *
+ * Invokes the unwind hook (which answers the caller if that has not
+ * happened yet) and then destroys the transaction frame.  Returns 0.
+ */
+int
+afr_utimens_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/**
+ * afr_utimens - replicated utimens fop.
+ *
+ * @frame: caller's frame; unwound with (op_ret, op_errno, struct stat *)
+ * @this:  the AFR xlator
+ * @loc:   file whose timestamps are changed
+ * @tv:    tv[0] and tv[1] are copied and passed to every child
+ *
+ * Copies the frame, fills in a fresh afr_local_t and runs the operation as
+ * an AFR_METADATA_TRANSACTION.  If setup fails the caller is unwound with
+ * op_ret -1 and a NULL stat buffer.  Always returns 0.
+ */
+int
+afr_utimens (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, struct timespec tv[2])
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        /* loc and loc->inode are dereferenced below */
+        VALIDATE_OR_GOTO (loc, out);
+        VALIDATE_OR_GOTO (loc->inode, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* do not report a failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        local->op_ret = -1;
+
+        local->cont.utimens.tv[0] = tv[0];
+        local->cont.utimens.tv[1] = tv[1];
+
+        /* remembered so the reply carries the inode number the caller knows */
+        local->cont.utimens.ino = loc->inode->ino;
+
+        local->transaction.fop    = afr_utimens_wind;
+        local->transaction.done   = afr_utimens_done;
+        local->transaction.unwind = afr_utimens_unwind;
+
+        loc_copy (&local->loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.start      = 0;
+        local->transaction.len        = 0;
+        local->transaction.pending    = AFR_METADATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ setxattr */
+
+
+/**
+ * afr_setxattr_unwind - answer the caller of setxattr at most once.
+ *
+ * The caller's frame is detached from the local under frame->lock; if one
+ * was still attached it is unwound with the recorded op_ret and op_errno.
+ * Always returns 0.
+ */
+int
+afr_setxattr_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local  = frame->local;
+        afr_private_t *priv   = NULL;
+        call_frame_t  *caller = NULL;
+
+        priv = this->private;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (caller != NULL) {
+                AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno);
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_setxattr_wind_cbk - per-child reply handler for a replicated setxattr.
+ *
+ * @cookie carries the index of the replying child.  The first successful
+ * reply supplies op_ret; the caller is unwound early once priv->wait_count
+ * children have succeeded, and the transaction is resumed after the last
+ * reply.  Always returns 0.
+ */
+int
+afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t   *local = NULL;
+        afr_private_t *priv  = NULL;
+
+        int child_index = (long) cookie;
+        int call_count  = -1;
+        int need_unwind = 0;
+
+        local = frame->local;
+        priv  = this->private;
+
+        LOCK (&frame->lock);
+        {
+                /* Same bookkeeping as afr_utimens_wind_cbk: tell the
+                   transaction about a child that went down during the
+                   operation. */
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        if (local->success_count == 0) {
+                                local->op_ret = op_ret;
+                        }
+                        local->success_count++;
+
+                        if (local->success_count == priv->wait_count) {
+                                need_unwind = 1;
+                        }
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        if (need_unwind)
+                local->transaction.unwind (frame, this);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                local->transaction.resume (frame, this);
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_setxattr_wind - send setxattr (dict and flags saved in the local) to
+ * every child that is up.
+ *
+ * With no child up the transaction is resumed right away.  Returns 0.
+ */
+int
+afr_setxattr_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local    = frame->local;
+        afr_private_t *priv     = this->private;
+        int            to_send  = -1;
+        int            idx      = 0;
+
+        to_send = afr_up_children_count (priv->child_count, local->child_up);
+        if (to_send == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = to_send;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk,
+                                   (void *) (long) idx,
+                                   priv->children[idx],
+                                   priv->children[idx]->fops->setxattr,
+                                   &local->loc,
+                                   local->cont.setxattr.dict,
+                                   local->cont.setxattr.flags);
+
+                /* leave right after the last expected call */
+                to_send--;
+                if (to_send == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_setxattr_done - last step of the setxattr transaction: run the
+ * unwind hook, then destroy the transaction frame.  Returns 0.
+ */
+int
+afr_setxattr_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = NULL;
+
+        local = frame->local;
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+        return 0;
+}
+
+
+/**
+ * afr_setxattr - replicated setxattr fop.
+ *
+ * @frame: caller's frame; unwound with (op_ret, op_errno)
+ * @this:  the AFR xlator
+ * @loc:   file the attributes are set on
+ * @dict:  attributes to set; a reference is taken for the transaction
+ * @flags: passed through unchanged to every child
+ *
+ * Runs the operation as an AFR_METADATA_TRANSACTION on a copy of the
+ * caller's frame.  If setup fails the caller is unwound with op_ret -1.
+ * Always returns 0.
+ */
+int
+afr_setxattr (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, dict_t *dict, int32_t flags)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        /* validated like in afr_removexattr; loc is handed to loc_copy */
+        VALIDATE_OR_GOTO (loc, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* do not report a failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        local->op_ret = -1;
+
+        local->cont.setxattr.dict  = dict_ref (dict);
+        local->cont.setxattr.flags = flags;
+
+        local->transaction.fop    = afr_setxattr_wind;
+        local->transaction.done   = afr_setxattr_done;
+        local->transaction.unwind = afr_setxattr_unwind;
+
+        loc_copy (&local->loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.start      = 0;
+        local->transaction.len        = 0;
+        local->transaction.pending    = AFR_METADATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ removexattr */
+
+
+/**
+ * afr_removexattr_unwind - answer the caller of removexattr at most once.
+ *
+ * Under frame->lock the caller's frame is taken out of the local; if it
+ * was still there it is unwound with the stored op_ret and op_errno.
+ * Always returns 0.
+ */
+int
+afr_removexattr_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local       = frame->local;
+        afr_private_t *priv        = NULL;
+        call_frame_t  *reply_frame = NULL;
+
+        priv = this->private;
+
+        LOCK (&frame->lock);
+        {
+                reply_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!reply_frame)
+                return 0;
+
+        AFR_STACK_UNWIND (reply_frame, local->op_ret, local->op_errno);
+
+        return 0;
+}
+
+
+/**
+ * afr_removexattr_wind_cbk - per-child reply handler for a replicated
+ * removexattr.
+ *
+ * @cookie carries the index of the replying child.  The first successful
+ * reply supplies op_ret; the caller is unwound early once priv->wait_count
+ * children have succeeded, and the transaction is resumed after the last
+ * reply.  Always returns 0.
+ */
+int
+afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t   *local = NULL;
+        afr_private_t *priv  = NULL;
+
+        int child_index = (long) cookie;
+        int call_count  = -1;
+        int need_unwind = 0;
+
+        local = frame->local;
+        priv  = this->private;
+
+        LOCK (&frame->lock);
+        {
+                /* Same bookkeeping as afr_utimens_wind_cbk: tell the
+                   transaction about a child that went down during the
+                   operation. */
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        if (local->success_count == 0) {
+                                local->op_ret = op_ret;
+                        }
+                        local->success_count++;
+
+                        if (local->success_count == priv->wait_count) {
+                                need_unwind = 1;
+                        }
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        if (need_unwind)
+                local->transaction.unwind (frame, this);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                local->transaction.resume (frame, this);
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_removexattr_wind - send removexattr for the saved attribute name to
+ * every child that is up.
+ *
+ * With no child up the transaction is resumed right away.  Returns 0.
+ */
+int32_t
+afr_removexattr_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local       = frame->local;
+        afr_private_t *priv        = this->private;
+        int            outstanding = -1;
+        int            n           = 0;
+
+        outstanding = afr_up_children_count (priv->child_count,
+                                             local->child_up);
+        if (outstanding == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = outstanding;
+
+        for (n = 0; n < priv->child_count; n++) {
+                if (!local->child_up[n])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk,
+                                   (void *) (long) n,
+                                   priv->children[n],
+                                   priv->children[n]->fops->removexattr,
+                                   &local->loc,
+                                   local->cont.removexattr.name);
+
+                /* nothing more to send after the last up child */
+                outstanding -= 1;
+                if (outstanding == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_removexattr_done - last step of the removexattr transaction: run the
+ * unwind hook, then destroy the transaction frame.  Returns 0.
+ */
+int
+afr_removexattr_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = NULL;
+
+        local = frame->local;
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+        return 0;
+}
+
+
+/**
+ * afr_removexattr - replicated removexattr fop.
+ *
+ * @frame: caller's frame; unwound with (op_ret, op_errno)
+ * @this:  the AFR xlator
+ * @loc:   file the attribute is removed from
+ * @name:  attribute name; duplicated for the lifetime of the transaction
+ *
+ * Runs the operation as an AFR_METADATA_TRANSACTION on a copy of the
+ * caller's frame.  If setup fails the caller is unwound with op_ret -1.
+ * Always returns 0.
+ */
+int
+afr_removexattr (call_frame_t *frame, xlator_t *this,
+                 loc_t *loc, const char *name)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        VALIDATE_OR_GOTO (loc, out);
+        /* strdup() below must not be handed a NULL pointer */
+        VALIDATE_OR_GOTO (name, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* do not report a failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        local->op_ret = -1;
+
+        local->cont.removexattr.name = strdup (name);
+
+        local->transaction.fop    = afr_removexattr_wind;
+        local->transaction.done   = afr_removexattr_done;
+        local->transaction.unwind = afr_removexattr_unwind;
+
+        loc_copy (&local->loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.start      = 0;
+        local->transaction.len        = 0;
+        local->transaction.pending    = AFR_METADATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+
+        return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h
new file mode 100644
index 000000000..9c0b5cad3
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-inode-write.h
@@ -0,0 +1,63 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __INODE_WRITE_H__
+#define __INODE_WRITE_H__
+/* Prototypes of the AFR fops that modify an inode.  Every function takes
+ * the caller's frame and the AFR xlator as its first two arguments.
+ * NOTE(review): only some of the implementations were checked while
+ * writing these comments; the undocumented prototypes below presumably
+ * follow the same transaction pattern - confirm in afr-inode-write.c. */
+int32_t
+afr_chmod (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode);
+
+int32_t
+afr_chown (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, uid_t uid, gid_t gid);
+
+int
+afr_fchown (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, uid_t uid, gid_t gid);
+
+int32_t
+afr_fchmod (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, mode_t mode);
+
+int32_t
+afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset);
+
+int32_t
+afr_truncate (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, off_t offset);
+/* Runs as an AFR data transaction with start 0 and len = offset. */
+int32_t
+afr_ftruncate (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, off_t offset);
+/* Copies tv[0] and tv[1] and applies them to loc on every child that is up,
+ * as an AFR metadata transaction; the caller gets a struct stat back. */
+int32_t
+afr_utimens (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, struct timespec tv[2]);
+/* Takes a reference on dict and sets it on loc on every child that is up,
+ * as an AFR metadata transaction. */
+int32_t
+afr_setxattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, dict_t *dict, int32_t flags);
+/* Duplicates name and removes that attribute from loc on every child that
+ * is up, as an AFR metadata transaction. */
+int32_t
+afr_removexattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name);
+
+#endif /* __INODE_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
new file mode 100644
index 000000000..45d065169
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -0,0 +1,1073 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "byte-order.h"
+
+#include "afr.h"
+#include "afr-transaction.h"
+#include "afr-self-heal-common.h"
+#include "afr-self-heal.h"
+
+
+/**
+ * afr_sh_select_source - pick the child to heal from.
+ *
+ * Returns the lowest index whose sources[] entry is non-zero, or -1 when
+ * no entry is set.
+ * TODO: take into account option 'favorite-child'
+ */
+
+int
+afr_sh_select_source (int sources[], int child_count)
+{
+        int idx = 0;
+
+        while (idx < child_count) {
+                if (sources[idx] != 0)
+                        return idx;
+                idx++;
+        }
+
+        return -1;
+}
+
+
+/**
+ * afr_sh_sink_count - number of children that are not sources, i.e. the
+ * number of zero entries among the first child_count entries of sources[].
+ */
+
+int
+afr_sh_sink_count (int sources[], int child_count)
+{
+        int idx   = 0;
+        int count = 0;
+
+        for (idx = 0; idx < child_count; idx++) {
+                if (sources[idx] == 0)
+                        count += 1;
+        }
+
+        return count;
+}
+
+/**
+ * afr_sh_source_count - number of non-zero entries among the first
+ * child_count entries of sources[].
+ */
+int
+afr_sh_source_count (int sources[], int child_count)
+{
+        int total = 0;
+        int pos   = 0;
+
+        while (pos < child_count) {
+                total += (sources[pos] != 0) ? 1 : 0;
+                pos++;
+        }
+
+        return total;
+}
+
+
+/**
+ * afr_sh_supress_errenous_children - a child that reported an error cannot
+ * be a source: clear sources[i] wherever child_errno[i] is non-zero.
+ * Always returns 0.
+ */
+int
+afr_sh_supress_errenous_children (int sources[], int child_errno[],
+                                  int child_count)
+{
+        int child;
+
+        for (child = 0; child < child_count; child++) {
+                if (child_errno[child] == 0)
+                        continue;
+                if (sources[child] == 0)
+                        continue;
+
+                sources[child] = 0;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_sh_supress_empty_children - drop children that cannot serve as a
+ * source because they lack the changelog xattr.
+ *
+ * If no child's xattr dict holds @key at all, children whose st_size is 0
+ * are cleared from sources[] instead.  Otherwise every child without a
+ * dict, without @key, or with a NULL value for @key is cleared.
+ * Always returns 0.
+ */
+int
+afr_sh_supress_empty_children (int sources[], dict_t *xattr[],
+                               struct stat *buf,
+                               int child_count, const char *key)
+{
+        int32_t *pending    = NULL;
+        int      have_xattr = 0;
+        int      idx        = 0;
+
+        /* was the file created by afr with xattrs on at least one child? */
+        for (idx = 0; idx < child_count && !have_xattr; idx++) {
+                if (xattr[idx] == NULL)
+                        continue;
+
+                if (dict_get_ptr (xattr[idx], (char *)key,
+                                  VOID(&pending)) == 0)
+                        have_xattr = 1;
+        }
+
+        if (!have_xattr) {
+                /* suppress 0-byte files so that an empty file created by
+                   directory self-heal never overwrites the 'good' file */
+                for (idx = 0; idx < child_count; idx++) {
+                        if (buf[idx].st_size == 0)
+                                sources[idx] = 0;
+                }
+                return 0;
+        }
+
+        for (idx = 0; idx < child_count; idx++) {
+                if (xattr[idx] == NULL
+                    || dict_get_ptr (xattr[idx], (char *)key,
+                                     VOID(&pending)) != 0
+                    || pending == NULL)
+                        sources[idx] = 0;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_sh_print_pending_matrix - log the pending matrix, one row per line,
+ * at debug level.
+ *
+ * @pending_matrix: priv->child_count rows of priv->child_count entries
+ * @this:           the AFR xlator (supplies child_count and the log name)
+ *
+ * Logs nothing if the scratch buffer cannot be allocated.
+ */
+void
+afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)
+{
+        afr_private_t *priv = this->private;
+
+        char   *buf  = NULL;
+        char   *ptr  = NULL;
+        size_t  size = 0;
+
+        int i, j;
+
+        /* Worst case per entry is "-2147483648 ": sign + 10 digits + 1
+           space = 12 bytes.  Add "[ " (2), "]" (1) and the NUL (1).  The
+           previous 11-bytes-per-entry estimate ignored the sign. */
+        size = (size_t) priv->child_count * 12 + 4;
+
+        buf = MALLOC (size);
+        if (!buf)
+                return;
+
+        for (i = 0; i < priv->child_count; i++) {
+                ptr = buf;
+                ptr += sprintf (ptr, "[ ");
+                for (j = 0; j < priv->child_count; j++) {
+                        ptr += sprintf (ptr, "%d ", pending_matrix[i][j]);
+                }
+                ptr += sprintf (ptr, "]");
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "pending_matrix: %s", buf);
+        }
+
+        FREE (buf);
+}
+
+
+/**
+ * afr_sh_build_pending_matrix - fill the pending matrix from the xattrs.
+ *
+ * @pending_matrix: child_count x child_count matrix, overwritten
+ * @xattr:          per-child xattr dicts; NULL entries are skipped
+ * @child_count:    number of children
+ * @key:            name of the changelog xattr to read
+ *
+ * Row i receives the array stored under @key in xattr[i], converted with
+ * ntoh32.  Rows of children without a dict, without @key, or with a NULL
+ * value stay all zero.
+ */
+void
+afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[],
+                             int child_count, const char *key)
+{
+        int      i       = 0;
+        int      j       = 0;
+        int32_t *pending = NULL;
+        int      ret     = -1;
+
+        /* start clean */
+        for (i = 0; i < child_count; i++) {
+                for (j = 0; j < child_count; j++) {
+                        pending_matrix[i][j] = 0;
+                }
+        }
+
+        for (i = 0; i < child_count; i++) {
+                if (!xattr[i])
+                        continue;
+
+                pending = NULL;
+
+                ret = dict_get_ptr (xattr[i], (char *) key,
+                                    VOID(&pending));
+                /* a key stored with a NULL value must not be dereferenced */
+                if (ret != 0 || !pending)
+                        continue;
+
+                for (j = 0; j < child_count; j++) {
+                        pending_matrix[i][j] = ntoh32 (pending[j]);
+                }
+        }
+}
+
+
+/**
+ * afr_sh_mark_sources - decide which children are sources.
+ *
+ * The diagonal of pending_matrix is zeroed first (a child's pending count
+ * against itself is ignored).  Child i is then a source when column i is
+ * all zero, i.e. no child holds pending operations against it.
+ * sources[] is overwritten with 1 (source) or 0; the number of sources is
+ * returned.
+ */
+
+int
+afr_sh_mark_sources (int32_t *pending_matrix[], int sources[], int child_count)
+{
+        int row   = 0;
+        int col   = 0;
+        int found = 0;
+        int clean = 0;
+
+        /* start clean and disregard self-referring pending entries */
+        for (col = 0; col < child_count; col++) {
+                sources[col] = 0;
+                pending_matrix[col][col] = 0;
+        }
+
+        for (col = 0; col < child_count; col++) {
+                clean = 1;
+
+                for (row = 0; row < child_count; row++) {
+                        if (pending_matrix[row][col] != 0) {
+                                clean = 0;
+                                break;
+                        }
+                }
+
+                if (clean) {
+                        sources[col] = 1;
+                        found++;
+                }
+        }
+
+        return found;
+}
+
+
+/**
+ * afr_sh_pending_to_delta - compute the xattr deltas that erase the
+ * pending counts towards the children that were healed.
+ *
+ * delta_matrix is zeroed, then delta[i][j] is set to -pending[i][j] for
+ * every column j with success[j] non-zero.
+ */
+void
+afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[],
+                         int success[], int child_count)
+{
+        int row = 0;
+        int col = 0;
+
+        /* start clean */
+        for (row = 0; row < child_count; row++)
+                for (col = 0; col < child_count; col++)
+                        delta_matrix[row][col] = 0;
+
+        for (row = 0; row < child_count; row++) {
+                for (col = 0; col < child_count; col++) {
+                        if (success[col])
+                                delta_matrix[row][col] =
+                                        -pending_matrix[row][col];
+                }
+        }
+}
+
+
+/**
+ * afr_sh_delta_to_xattr - store each row of the delta matrix in the
+ * matching xattr dict.
+ *
+ * @delta_matrix: child_count x child_count matrix of deltas
+ * @xattr:        per-child dicts; NULL entries are skipped
+ * @child_count:  number of children
+ * @key:          name under which the row is stored
+ *
+ * Row i is converted with hton32 into a freshly allocated array and handed
+ * to dict_set_bin on xattr[i].
+ *
+ * Returns 0 on success, -1 if an allocation or dict_set_bin failed for at
+ * least one child (the remaining children are still processed).
+ */
+int
+afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[],
+                       int child_count, const char *key)
+{
+        int i = 0;
+        int j = 0;
+
+        int ret    = 0;
+        int op_ret = 0;
+
+        int32_t *pending = NULL;
+
+        for (i = 0; i < child_count; i++) {
+                if (!xattr[i])
+                        continue;
+
+                pending = CALLOC (sizeof (int32_t), child_count);
+                if (!pending) {
+                        /* was dereferenced unconditionally before */
+                        op_ret = -1;
+                        continue;
+                }
+
+                for (j = 0; j < child_count; j++) {
+                        pending[j] = hton32 (delta_matrix[i][j]);
+                }
+
+                ret = dict_set_bin (xattr[i], (char *) key, pending,
+                                    child_count * sizeof (int32_t));
+                /* NOTE(review): ownership of 'pending' after a failed
+                   dict_set_bin is not visible here, so it is not freed. */
+                if (ret != 0)
+                        op_ret = -1;
+        }
+
+        return op_ret;
+}
+
+
+/**
+ * sh_has_pending_for_key - shared body of the afr_sh_has_*_pending checks.
+ *
+ * Looks up the array stored under @key in @xattr and returns 1 if any
+ * entry other than the one at index @child_index is non-zero, 0 otherwise
+ * (also when the key is missing or its value is NULL).
+ */
+static int
+sh_has_pending_for_key (dict_t *xattr, char *key, int child_index,
+                        xlator_t *this)
+{
+        afr_private_t *priv        = this->private;
+        int32_t       *pending     = NULL;
+        void          *tmp_pending = NULL; /* avoids gcc 'type-punned' warnings */
+
+        int ret = -1;
+        int i   = 0;
+
+        ret = dict_get_ptr (xattr, key, &tmp_pending);
+
+        if (ret != 0 || !tmp_pending)
+                return 0;
+
+        pending = tmp_pending;
+        for (i = 0; i < priv->child_count; i++) {
+                /* the entry a child keeps about itself does not count */
+                if (i == child_index)
+                        continue;
+                if (pending[i])
+                        return 1;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_sh_has_metadata_pending - does @xattr record pending metadata
+ * operations against any other child?
+ *
+ * Despite its name, @child_count is used as the index of the entry to
+ * skip.  Returns 1 if something is pending, 0 otherwise.
+ */
+int
+afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this)
+{
+        return sh_has_pending_for_key (xattr, AFR_METADATA_PENDING,
+                                       child_count, this);
+}
+
+
+/**
+ * afr_sh_has_data_pending - same check as afr_sh_has_metadata_pending for
+ * the AFR_DATA_PENDING changelog.
+ */
+int
+afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this)
+{
+        return sh_has_pending_for_key (xattr, AFR_DATA_PENDING,
+                                       child_count, this);
+}
+
+
+/**
+ * afr_sh_has_entry_pending - same check as afr_sh_has_metadata_pending for
+ * the AFR_ENTRY_PENDING changelog.
+ */
+int
+afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this)
+{
+        return sh_has_pending_for_key (xattr, AFR_ENTRY_PENDING,
+                                       child_count, this);
+}
+
+
+
+/**
+ * afr_sh_is_matrix_zero - returns 1 when every entry of the
+ * child_count x child_count pending matrix is zero, 0 otherwise.
+ */
+
+int
+afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count)
+{
+        int      row = 0;
+        int      col = 0;
+        int32_t *cur = NULL;
+
+        for (row = 0; row < child_count; row++) {
+                cur = pending_matrix[row];
+
+                for (col = 0; col < child_count; col++) {
+                        if (cur[col] != 0)
+                                return 0;
+                }
+        }
+
+        return 1;
+}
+
+
+/**
+ * afr_sh_missing_entries_done - end of the missing-entries phase.
+ *
+ * Clears the per-child stat buffers and drops the per-child xattr dicts,
+ * then either aborts the self-heal through the completion callback (when
+ * local->govinda_gOvinda is set) or moves on to metadata self-heal.
+ * Returns 0.
+ */
+int
+afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        afr_private_t   *priv  = this->private;
+        int              child = 0;
+
+        /* sh->child_errno is not reset here; only the stat buffers are */
+        memset (sh->buf, 0, sizeof (struct stat) * priv->child_count);
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (sh->xattr[child] != NULL) {
+                        dict_unref (sh->xattr[child]);
+                        sh->xattr[child] = NULL;
+                }
+        }
+
+        if (!local->govinda_gOvinda) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "proceeding to metadata check on %s",
+                        local->loc.path);
+                afr_self_heal_metadata (frame, this);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_WARNING,
+                "aborting selfheal of %s",
+                local->loc.path);
+        sh->completion_cbk (frame, this);
+
+        return 0;
+}
+
+
+/**
+ * sh_missing_entries_unlck_cbk - reply handler for the entry unlock calls.
+ *
+ * The result of the unlock is ignored; once the last reply is in, the
+ * missing-entries phase is finished.  Returns 0.
+ */
+int
+sh_missing_entries_unlck_cbk (call_frame_t *frame, void *cookie,
+                              xlator_t *this,
+                              int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t     *local     = frame->local;
+        afr_self_heal_t *sh        = NULL;
+        afr_private_t   *priv      = NULL;
+        int              remaining = 0;
+
+        sh   = &local->self_heal;
+        priv = this->private;
+
+        /* nothing is recorded under the lock */
+        LOCK (&frame->lock);
+        {
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        afr_sh_missing_entries_done (frame, this);
+
+        return 0;
+}
+
+
+/**
+ * sh_missing_entries_finish - release the entry lock taken on the parent
+ * directory, on every child that is up.
+ *
+ * local->call_count is set to local->child_count; the unlock replies are
+ * collected by sh_missing_entries_unlck_cbk.  Returns 0.
+ */
+static int
+sh_missing_entries_finish (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local   = frame->local;
+        afr_self_heal_t *sh      = &local->self_heal;
+        afr_private_t   *priv    = this->private;
+        int              pending = local->child_count;
+        int              child   = 0;
+
+        local->call_count = pending;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unlocking %"PRId64"/%s on subvolume %s",
+                        sh->parent_loc.inode->ino, local->loc.name,
+                        priv->children[child]->name);
+
+                STACK_WIND (frame, sh_missing_entries_unlck_cbk,
+                            priv->children[child],
+                            priv->children[child]->fops->entrylk,
+                            &sh->parent_loc, local->loc.name,
+                            ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+
+                /* leave right after the last expected call */
+                if (--pending == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * sh_destroy_cbk - reply handler for fire-and-forget calls: the result is
+ * ignored and the call stack that carried the request is destroyed.
+ */
+static int
+sh_destroy_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int op_errno, struct stat *stbuf)
+{
+        STACK_DESTROY (frame->root);
+
+        return 0;
+}
+
+
+/**
+ * sh_missing_entries_newentry_cbk - reply handler for the mknod / mkdir /
+ * symlink calls that recreate a missing entry.
+ *
+ * @cookie carries the index of the child the entry was created on.  On
+ * success a chown to the owner recorded for the source child is sent on a
+ * separate frame (its reply is discarded by sh_destroy_cbk).  After the
+ * last reply the entry lock is released.  Returns 0.
+ */
+static int
+sh_missing_entries_newentry_cbk (call_frame_t *frame, void *cookie,
+                                 xlator_t *this,
+                                 int32_t op_ret, int32_t op_errno,
+                                 inode_t *inode, struct stat *stbuf)
+{
+        afr_local_t     *local       = NULL;
+        afr_self_heal_t *sh          = NULL;
+        afr_private_t   *priv        = NULL;
+        call_frame_t    *chown_frame = NULL;
+        int              call_count  = 0;
+        int              child_index = 0;
+        struct stat     *buf         = NULL;
+
+
+        local = frame->local;
+        sh    = &local->self_heal;
+        priv  = this->private;
+
+        buf = &sh->buf[sh->source];
+        child_index = (long) cookie;
+
+        if (op_ret == 0) {
+                chown_frame = copy_frame (frame);
+
+                if (chown_frame) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "chown %s to %d %d on subvolume %s",
+                                local->loc.path, buf->st_uid, buf->st_gid,
+                                priv->children[child_index]->name);
+
+                        STACK_WIND (chown_frame, sh_destroy_cbk,
+                                    priv->children[child_index],
+                                    priv->children[child_index]->fops->chown,
+                                    &local->loc,
+                                    buf->st_uid, buf->st_gid);
+                } else {
+                        /* winding on a NULL frame would crash; skip the
+                           chown and let the self-heal continue */
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "out of memory :(");
+                }
+        }
+
+        LOCK (&frame->lock);
+        {
+        }
+        UNLOCK (&frame->lock);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                sh_missing_entries_finish (frame, this);
+        }
+
+        return 0;
+}
+
+
+/**
+ * sh_missing_entries_mknod - recreate a missing non-directory,
+ * non-symlink entry on every child that reported ENOENT.
+ *
+ * Mode and device number are taken from the stat buffer of the source
+ * child.  Replies are collected by sh_missing_entries_newentry_cbk.
+ * Returns 0.
+ */
+static int
+sh_missing_entries_mknod (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local        = NULL;
+        afr_self_heal_t *sh           = NULL;
+        afr_private_t   *priv         = NULL;
+        int              i            = 0;
+        int              enoent_count = 0;
+        int              call_count   = 0;
+        mode_t           st_mode      = 0;
+        dev_t            st_rdev      = 0;
+
+
+        local = frame->local;
+        sh    = &local->self_heal;
+        priv  = this->private;
+
+        for (i = 0; i < priv->child_count; i++)
+                if (sh->child_errno[i] == ENOENT)
+                        enoent_count++;
+
+        call_count = enoent_count;
+        local->call_count = call_count;
+
+        st_mode = sh->buf[sh->source].st_mode;
+        /* mknod wants the device the node represents (st_rdev); st_dev is
+           the device holding the file and gives block/char nodes the
+           wrong number */
+        st_rdev = sh->buf[sh->source].st_rdev;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "mknod %s mode 0%o on %d subvolumes",
+                local->loc.path, st_mode, enoent_count);
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (sh->child_errno[i] == ENOENT) {
+                        STACK_WIND_COOKIE (frame,
+                                           sh_missing_entries_newentry_cbk,
+                                           (void *) (long) i,
+                                           priv->children[i],
+                                           priv->children[i]->fops->mknod,
+                                           &local->loc, st_mode, st_rdev);
+                        if (!--call_count)
+                                break;
+                }
+        }
+
+        return 0;
+}
+
+
+/**
+ * sh_missing_entries_mkdir - recreate a missing directory on every child
+ * that reported ENOENT, using the mode of the source child.
+ *
+ * Replies are collected by sh_missing_entries_newentry_cbk.  Returns 0.
+ */
+static int
+sh_missing_entries_mkdir (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local   = frame->local;
+        afr_self_heal_t *sh      = &local->self_heal;
+        afr_private_t   *priv    = this->private;
+        int              child   = 0;
+        int              missing = 0;
+        int              to_send = 0;
+        mode_t           mode    = 0;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (sh->child_errno[child] == ENOENT)
+                        missing += 1;
+        }
+
+        to_send = missing;
+        local->call_count = to_send;
+
+        mode = sh->buf[sh->source].st_mode;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "mkdir %s mode 0%o on %d subvolumes",
+                local->loc.path, mode, missing);
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (sh->child_errno[child] != ENOENT)
+                        continue;
+
+                STACK_WIND_COOKIE (frame,
+                                   sh_missing_entries_newentry_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->mkdir,
+                                   &local->loc, mode);
+
+                /* leave right after the last expected call */
+                if (--to_send == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * sh_missing_entries_symlink - recreate a missing symlink pointing at
+ * @link on every child that reported ENOENT.
+ *
+ * Replies are collected by sh_missing_entries_newentry_cbk.  Returns 0.
+ */
+static int
+sh_missing_entries_symlink (call_frame_t *frame, xlator_t *this,
+                            const char *link)
+{
+        afr_local_t     *local   = frame->local;
+        afr_self_heal_t *sh      = &local->self_heal;
+        afr_private_t   *priv    = this->private;
+        int              child   = 0;
+        int              missing = 0;
+        int              to_send = 0;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (sh->child_errno[child] == ENOENT)
+                        missing += 1;
+        }
+
+        to_send = missing;
+        local->call_count = to_send;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "symlink %s -> %s on %d subvolumes",
+                local->loc.path, link, missing);
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (sh->child_errno[child] != ENOENT)
+                        continue;
+
+                STACK_WIND_COOKIE (frame,
+                                   sh_missing_entries_newentry_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->symlink,
+                                   link, &local->loc);
+
+                /* leave right after the last expected call */
+                if (--to_send == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * sh_missing_entries_readlink_cbk - reply handler for the readlink sent to
+ * the source child.
+ *
+ * A positive op_ret means the target was read: the symlink is recreated
+ * on the children missing it.  Otherwise the phase is wound up.
+ * Returns 0.
+ */
+static int
+sh_missing_entries_readlink_cbk (call_frame_t *frame, void *cookie,
+                                 xlator_t *this,
+                                 int32_t op_ret, int32_t op_errno,
+                                 const char *link)
+{
+        if (op_ret <= 0) {
+                sh_missing_entries_finish (frame, this);
+                return 0;
+        }
+
+        sh_missing_entries_symlink (frame, this, link);
+
+        return 0;
+}
+
+
+/**
+ * sh_missing_entries_readlink - ask the source child for the target of
+ * the symlink (up to 4096 bytes); the reply is handled by
+ * sh_missing_entries_readlink_cbk.  Returns 0.
+ */
+static int
+sh_missing_entries_readlink (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        afr_private_t   *priv  = this->private;
+
+        STACK_WIND (frame, sh_missing_entries_readlink_cbk,
+                    priv->children[sh->source],
+                    priv->children[sh->source]->fops->readlink,
+                    &local->loc, 4096);
+
+        return 0;
+}
+
+
+/**
+ * sh_missing_entries_create - decide how to recreate the entry on the
+ * children that lack it.
+ *
+ * The lookup results are scanned: children with ENOENT are counted, and
+ * the first child without an error becomes the source and defines the
+ * file type.  The phase is wound up without creating anything when the
+ * children disagree on the type (the self-heal is then flagged for abort
+ * through local->govinda_gOvinda), when no child answered, or when
+ * nothing is missing.  Otherwise the creation is dispatched by file type.
+ * Returns 0.
+ */
+static int
+sh_missing_entries_create (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local           = NULL;
+        afr_self_heal_t *sh              = NULL;
+        int              type            = 0;
+        int              i               = 0;
+        afr_private_t   *priv            = NULL;
+        int              enoent_count    = 0;
+        int              govinda_gOvinda = 0;
+
+
+        local = frame->local;
+        sh    = &local->self_heal;
+        priv  = this->private;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (sh->child_errno[i]) {
+                        if (sh->child_errno[i] == ENOENT)
+                                enoent_count++;
+                } else {
+                        if (type) {
+                                if (type != (sh->buf[i].st_mode & S_IFMT))
+                                        govinda_gOvinda = 1;
+                        } else {
+                                sh->source = i;
+                                type = sh->buf[i].st_mode & S_IFMT;
+                        }
+                }
+        }
+
+        if (govinda_gOvinda) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "conflicting filetypes exist for path %s. returning.",
+                        local->loc.path);
+
+                local->govinda_gOvinda = 1;
+                sh_missing_entries_finish (frame, this);
+                return 0;
+        }
+
+        if (!type) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no source found for %s. all nodes down?. returning.",
+                        local->loc.path);
+                /* subvolumes down and/or file does not exist */
+                sh_missing_entries_finish (frame, this);
+                return 0;
+        }
+
+        if (enoent_count == 0) {
+                /* the normal outcome, hence not logged as an error */
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no missing files - %s. proceeding to metadata check",
+                        local->loc.path);
+                /* proceed to next step - metadata self-heal */
+                sh_missing_entries_finish (frame, this);
+                return 0;
+        }
+
+        switch (type) {
+        case S_IFSOCK:
+        case S_IFREG:
+        case S_IFBLK:
+        case S_IFCHR:
+        case S_IFIFO:
+                sh_missing_entries_mknod (frame, this);
+                break;
+        case S_IFLNK:
+                sh_missing_entries_readlink (frame, this);
+                break;
+        case S_IFDIR:
+                sh_missing_entries_mkdir (frame, this);
+                break;
+        default:
+                gf_log (this->name, GF_LOG_ERROR,
+                        "unknown file type: 0%o", type);
+                local->govinda_gOvinda = 1;
+                sh_missing_entries_finish (frame, this);
+                break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * sh_missing_entries_lookup_cbk - reply handler for the per-child lookup.
+ *
+ * @cookie carries the child index.  A successful lookup stores the stat
+ * buffer for that child, a failed one stores op_errno.  After the last
+ * reply the entry is recreated where needed.  Returns 0.
+ */
+static int
+sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie,
+                               xlator_t *this,
+                               int32_t op_ret, int32_t op_errno,
+                               inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+        afr_local_t     *local     = frame->local;
+        afr_self_heal_t *sh        = &local->self_heal;
+        afr_private_t   *priv      = this->private;
+        int              child     = (long) cookie;
+        int              remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != 0) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "path %s on subvolume %s => -1 (%s)",
+                                local->loc.path,
+                                priv->children[child]->name,
+                                strerror (op_errno));
+
+                        sh->child_errno[child] = op_errno;
+                } else {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "path %s on subvolume %s is of mode 0%o",
+                                local->loc.path,
+                                priv->children[child]->name,
+                                buf->st_mode);
+
+                        sh->buf[child] = *buf;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        sh_missing_entries_create (frame, this);
+
+        return 0;
+}
+
+
+/**
+ * sh_missing_entries_lookup - look the path up on every child that is up.
+ *
+ * The request dict asks for AFR_ENTRY_PENDING with a size of one int32_t
+ * per child (the dict is omitted if it cannot be created).  Replies are
+ * collected by sh_missing_entries_lookup_cbk.  Returns 0.
+ */
+static int
+sh_missing_entries_lookup (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = NULL;
+        dict_t        *request = NULL;
+        int            to_send = local->child_count;
+        int            child   = 0;
+        int            ret     = -1;
+
+        priv = this->private;
+
+        local->call_count = to_send;
+
+        request = dict_new();
+        if (request != NULL)
+                ret = dict_set_uint64 (request, AFR_ENTRY_PENDING,
+                                       priv->child_count * sizeof(int32_t));
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "looking up %s on subvolume %s",
+                        local->loc.path, priv->children[child]->name);
+
+                STACK_WIND_COOKIE (frame,
+                                   sh_missing_entries_lookup_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->lookup,
+                                   &local->loc, request);
+
+                /* leave right after the last expected call */
+                if (--to_send == 0)
+                        break;
+        }
+
+        if (request != NULL)
+                dict_unref (request);
+
+        return 0;
+}
+
+
+/**
+ * sh_missing_entries_lk_cbk - reply handler for the non-blocking entry
+ * lock requests.
+ *
+ * A failed lock marks the self-heal as failed (EAGAIN is logged at debug
+ * level, other errors at error level).  After the last reply the phase
+ * either unlocks and stops (on failure) or continues with the lookup.
+ * Returns 0.
+ */
+static int
+sh_missing_entries_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t     *local     = frame->local;
+        afr_self_heal_t *sh        = &local->self_heal;
+        int              child     = (long) cookie;
+        int              remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "inode of %s on child %d locked",
+                                local->loc.path, child);
+                } else {
+                        sh->op_failed = 1;
+
+                        gf_log (this->name,
+                                (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
+                                "locking inode of %s on child %d failed: %s",
+                                local->loc.path, child,
+                                strerror (op_errno));
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        if (sh->op_failed == 1)
+                sh_missing_entries_finish (frame, this);
+        else
+                sh_missing_entries_lookup (frame, this);
+
+        return 0;
+}
+
+
+/**
+ * afr_self_heal_missing_entries - start the missing-entries phase.
+ *
+ * Builds the loc of the parent directory and requests a non-blocking
+ * write entry lock on the entry name from every child that is up.
+ * Replies are collected by sh_missing_entries_lk_cbk.  Returns 0.
+ */
+static int
+afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local   = frame->local;
+        afr_self_heal_t *sh      = &local->self_heal;
+        afr_private_t   *priv    = this->private;
+        int              child   = 0;
+        int              to_send = 0;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "attempting to recreate missing entries for path=%s",
+                local->loc.path);
+
+        afr_build_parent_loc (&sh->parent_loc, &local->loc);
+
+        to_send = local->child_count;
+        local->call_count = to_send;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND (frame, sh_missing_entries_lk_cbk,
+                            priv->children[child],
+                            priv->children[child]->fops->entrylk,
+                            &sh->parent_loc, local->loc.name,
+                            ENTRYLK_LOCK_NB, ENTRYLK_WRLCK);
+
+                /* leave right after the last expected call */
+                if (--to_send == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_self_heal - entry point of self-heal for the file in local->loc.
+ *
+ * @completion_cbk is stored and invoked when the self-heal ends.  The
+ * per-child work arrays and the two child_count x child_count matrices
+ * are allocated here.  If some lookups succeeded while others reported
+ * ENOENT, the missing entries are recreated first; otherwise that phase
+ * is skipped.  Returns 0.
+ *
+ * NOTE(review): none of the allocations is checked before use.
+ */
+int
+afr_self_heal (call_frame_t *frame, xlator_t *this,
+               int (*completion_cbk) (call_frame_t *, xlator_t *))
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        afr_private_t   *priv  = this->private;
+        int              row   = 0;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "performing self heal on %s (metadata=%d data=%d entry=%d)",
+                local->loc.path,
+                local->need_metadata_self_heal,
+                local->need_data_self_heal,
+                local->need_entry_self_heal);
+
+        sh->completion_cbk = completion_cbk;
+
+        /* one slot per child */
+        sh->buf         = CALLOC (priv->child_count, sizeof (struct stat));
+        sh->child_errno = CALLOC (priv->child_count, sizeof (int));
+        sh->success     = CALLOC (priv->child_count, sizeof (int));
+        sh->xattr       = CALLOC (priv->child_count, sizeof (dict_t *));
+        sh->sources     = CALLOC (priv->child_count, sizeof (*sh->sources));
+
+        sh->pending_matrix = CALLOC (priv->child_count, sizeof (int32_t *));
+        for (row = 0; row < priv->child_count; row++)
+                sh->pending_matrix[row] = CALLOC (priv->child_count,
+                                                  sizeof (int32_t));
+
+        sh->delta_matrix = CALLOC (priv->child_count, sizeof (int32_t *));
+        for (row = 0; row < priv->child_count; row++)
+                sh->delta_matrix[row] = CALLOC (priv->child_count,
+                                                sizeof (int32_t));
+
+        if (!(local->success_count && local->enoent_count)) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "proceeding to metadata check on %s",
+                        local->loc.path);
+                afr_sh_missing_entries_done (frame, this);
+                return 0;
+        }
+
+        afr_self_heal_missing_entries (frame, this);
+
+        return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h
new file mode 100644
index 000000000..9dd597f07
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-common.h
@@ -0,0 +1,66 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __AFR_SELF_HEAL_COMMON_H__
+#define __AFR_SELF_HEAL_COMMON_H__
+
+/* True when the size recorded in a struct stat exceeds st_blocks * 512,
+   i.e. fewer blocks are occupied than the size would need. */
+#define FILE_HAS_HOLES(buf) (((buf)->st_size) > ((buf)->st_blocks * 512))
+
+/* Chooses one child from sources[]; the data self-heal code stores the
+   result as the index of the source subvolume. */
+int
+afr_sh_select_source (int sources[], int child_count);
+
+/* NOTE(review): presumably counts children whose sources[] entry is 0 --
+   confirm against the implementation. */
+int
+afr_sh_sink_count (int sources[], int child_count);
+
+/* Result is used by callers as the number of usable sources; 0 means no
+   source could be determined. */
+int
+afr_sh_source_count (int sources[], int child_count);
+
+/* Adjusts sources[] using the per-child errno array. */
+int
+afr_sh_supress_errenous_children (int sources[], int child_errno[],
+ int child_count);
+
+/* Adjusts sources[] using the per-child xattr dictionaries and stat
+   buffers for the given pending key. */
+int
+afr_sh_supress_empty_children (int sources[], dict_t *xattr[],
+ struct stat *buf,
+ int child_count, const char *key);
+
+/* Logging helper for a pending matrix. */
+void
+afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this);
+
+/* Fills pending_matrix from the per-child xattr dictionaries, reading the
+   value stored under key. */
+void
+afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[],
+ int child_count, const char *key);
+
+/* Derives delta_matrix from pending_matrix and the per-child success
+   flags; callers invoke it before afr_sh_delta_to_xattr. */
+void
+afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[],
+ int32_t success[], int child_count);
+
+/* Marks entries of sources[] based on the pending matrix. */
+int
+afr_sh_mark_sources (int32_t *pending_matrix[], int sources[],
+ int child_count);
+
+/* Stores delta_matrix into the per-child dictionaries under key; callers
+   then send those dictionaries with an xattrop. */
+int
+afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[],
+ int child_count, const char *key);
+
+/* NOTE(review): presumably returns non-zero when every element of the
+   matrix is 0 -- confirm against the implementation. */
+int
+afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count);
+
+
+#endif /* __AFR_SELF_HEAL_COMMON_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
new file mode 100644
index 000000000..3a48da485
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -0,0 +1,1030 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+#include "byte-order.h"
+
+#include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-self-heal-common.h"
+
+
+
+/*
+ * Final step of data self-heal: log completion and hand control back to
+ * whoever started the heal through sh->completion_cbk.  Returns 0.
+ */
+int
+afr_sh_data_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+
+        /* TODO: cleanup sh->* */
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "self heal of %s completed", local->loc.path);
+
+        sh->completion_cbk (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Callback for every flush sent by afr_sh_data_close.  When the last
+ * reply arrives the healing fd is released and the heal is finished.
+ */
+int
+afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t     *local     = frame->local;
+        afr_self_heal_t *sh        = &local->self_heal;
+        int              remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        /* last reply: drop our reference on the fd and wrap up */
+        fd_unref (sh->healing_fd);
+        sh->healing_fd = NULL;
+        afr_sh_data_done (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Sends a flush on the healing fd to the source and to every sink that is
+ * up.  Replies are collected by afr_sh_data_flush_cbk.  If no healing fd
+ * was ever created, goes directly to afr_sh_data_done.  Returns 0.
+ */
+int
+afr_sh_data_close (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_self_heal_t *sh = NULL;
+ int i = 0;
+ int call_count = 0;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ if (!sh->healing_fd) {
+ afr_sh_data_done (frame, this);
+ return 0;
+ }
+
+ /* one flush for the source plus one per active sink */
+ call_count = sh->active_sinks + 1;
+ local->call_count = call_count;
+
+
+ /* flush the fd on the source first */
+ gf_log (this->name, GF_LOG_DEBUG,
+ "closing fd of %s on %s",
+ local->loc.path, priv->children[sh->source]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
+ (void *) (long) sh->source,
+ priv->children[sh->source],
+ priv->children[sh->source]->fops->flush,
+ sh->healing_fd);
+ call_count--;
+
+ /* then on every child that is not a source and is up */
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->sources[i] || !local->child_up[i])
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "closing fd of %s on %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->flush,
+ sh->healing_fd);
+ /* stop as soon as the expected number of calls was wound */
+ if (!--call_count)
+ break;
+ }
+
+ return 0;
+}
+
+
+/*
+ * Callback for every inodelk(F_UNLCK) sent by afr_sh_data_unlock.
+ *
+ * @cookie carries the index of the child that replied.  The outcome is
+ * only logged; a failed unlock does not stop the sequence.  When the last
+ * reply arrives, the healing fd is closed via afr_sh_data_close.
+ *
+ * The log messages describe an unlock, matching the wording used by
+ * afr_sh_entry_unlck_cbk.
+ */
+int
+afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local       = NULL;
+        int          call_count  = 0;
+        int          child_index = (long) cookie;
+
+
+        local = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "unlocking inode of %s on child %d failed: %s",
+                                local->loc.path, child_index,
+                                strerror (op_errno));
+                } else {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "unlocked inode of %s on child %d",
+                                local->loc.path, child_index);
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                afr_sh_data_close (frame, this);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Releases the whole-file inode lock on every child that is up.  Replies
+ * are collected by afr_sh_data_unlck_cbk.  Returns 0.
+ *
+ * The flock structure is zero-initialised so that l_whence and l_pid do
+ * not carry stack garbage to the subvolumes; l_start = 0, l_len = 0
+ * relative to SEEK_SET covers the entire file.
+ */
+int
+afr_sh_data_unlock (call_frame_t *frame, xlator_t *this)
+{
+        struct flock    flock      = {0, };
+        int             i          = 0;
+        int             call_count = 0;
+
+        afr_local_t    *local      = NULL;
+        afr_private_t  *priv       = NULL;
+
+
+        local = frame->local;
+        priv  = this->private;
+
+        call_count = local->child_count;
+
+        local->call_count = call_count;
+
+        flock.l_whence = SEEK_SET;
+        flock.l_start  = 0;
+        flock.l_len    = 0;
+        flock.l_type   = F_UNLCK;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (local->child_up[i]) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "unlocking %s on subvolume %s",
+                                local->loc.path, priv->children[i]->name);
+
+                        STACK_WIND_COOKIE (frame, afr_sh_data_unlck_cbk,
+                                           (void *) (long) i,
+                                           priv->children[i],
+                                           priv->children[i]->fops->inodelk,
+                                           &local->loc, F_SETLK, &flock);
+                        /* stop once the expected number of calls is out */
+                        if (!--call_count)
+                                break;
+                }
+        }
+
+        return 0;
+}
+
+
+/*
+ * Begins the teardown of a data self-heal: logs and then unlocks the
+ * inode on the children.  Returns 0.
+ */
+int
+afr_sh_data_finish (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "finishing data selfheal of %s", local->loc.path);
+
+        afr_sh_data_unlock (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Callback for every xattrop sent by afr_sh_data_erase_pending.  The
+ * result of the individual operation is not inspected; once all replies
+ * are in, the heal moves on to afr_sh_data_finish.
+ */
+int
+afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
+                               xlator_t *this, int32_t op_ret,
+                               int32_t op_errno, dict_t *xattr)
+{
+        int remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        afr_sh_data_finish (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Sends an xattrop (GF_XATTROP_ADD_ARRAY) carrying the AFR_DATA_PENDING
+ * delta to every child for which a lookup xattr was recorded.  Replies
+ * are collected by afr_sh_data_erase_pending_cbk.  Returns 0.
+ *
+ * NOTE(review): the CALLOC and get_new_dict results are not checked.
+ * NOTE(review): if no child has sh->xattr[i] set, nothing is wound and no
+ * callback will continue the heal -- confirm this cannot happen here.
+ */
+int
+afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int i = 0;
+ dict_t **erase_xattr = NULL;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+
+ afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix,
+ sh->success, priv->child_count);
+
+ erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count);
+
+ /* one dictionary, and one expected reply, per child that returned
+ an xattr during lookup */
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->xattr[i]) {
+ call_count++;
+
+ erase_xattr[i] = get_new_dict();
+ dict_ref (erase_xattr[i]);
+ }
+ }
+
+ afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr,
+ priv->child_count, AFR_DATA_PENDING);
+
+ local->call_count = call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!erase_xattr[i])
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "erasing pending flags from %s on %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->loc,
+ GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
+ if (!--call_count)
+ break;
+ }
+
+ /* drop the references taken above and free the pointer array */
+ for (i = 0; i < priv->child_count; i++) {
+ if (erase_xattr[i]) {
+ dict_unref (erase_xattr[i]);
+ }
+ }
+ FREE (erase_xattr);
+
+ return 0;
+}
+
+
+/*
+ * Callback for every ftruncate sent by afr_sh_data_trim_sinks.  @cookie
+ * is the index of the replying child.  The result is logged only; after
+ * the last reply the pending flags are erased.
+ */
+int
+afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        afr_private_t *priv        = this->private;
+        afr_local_t   *local       = frame->local;
+        int            child_index = (long) cookie;
+        int            remaining   = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "ftruncate of %s on subvolume %s completed",
+                                local->loc.path,
+                                priv->children[child_index]->name);
+                } else {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "ftruncate of %s on subvolume %s failed (%s)",
+                                local->loc.path,
+                                priv->children[child_index]->name,
+                                strerror (op_errno));
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining == 0)
+                afr_sh_data_erase_pending (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Sends ftruncate (to sh->file_size, the size recorded for the source) on
+ * the healing fd to every child that is not a source and is up.  Replies
+ * are collected by afr_sh_data_trim_cbk.  Returns 0.
+ */
+int
+afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int *sources = NULL;
+ int call_count = 0;
+ int i = 0;
+
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ sources = sh->sources;
+ /* one reply expected per active sink */
+ call_count = sh->active_sinks;
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i] || !local->child_up[i])
+ continue;
+
+ STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->ftruncate,
+ sh->healing_fd, sh->file_size);
+
+ /* stop once the expected number of calls is out */
+ if (!--call_count)
+ break;
+ }
+
+ return 0;
+}
+
+
+/* defined further below; drives the next read/write round */
+int
+afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this);
+
+/*
+ * Callback for every writev sent to a sink by afr_sh_data_read_cbk.
+ * @cookie is the index of the replying child.  A failed write marks the
+ * heal as failed (sh->op_failed).  After the last reply the iteration
+ * function decides what happens next.
+ */
+int
+afr_sh_data_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        afr_private_t   *priv        = this->private;
+        afr_local_t     *local       = frame->local;
+        afr_self_heal_t *sh          = &local->self_heal;
+        int              child_index = (long) cookie;
+        int              remaining   = 0;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "wrote %d bytes of data from %s to child %d, offset %"PRId64"",
+                op_ret, local->loc.path, child_index, sh->offset - op_ret);
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "write to %s failed on subvolume %s (%s)",
+                                local->loc.path,
+                                priv->children[child_index]->name,
+                                strerror (op_errno));
+                        sh->op_failed = 1;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        afr_sh_data_read_write_iter (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the readv sent to the source by afr_sh_data_read_write.
+ *
+ * @cookie: index of the source child
+ * @op_ret: number of bytes read, 0 at end of file, -1 on error
+ * @vector/@count: the data that was read
+ *
+ * - On a read error the heal is marked failed and torn down through
+ *   afr_sh_data_finish, the same path the write side takes via
+ *   afr_sh_data_read_write_iter.  Treating the error like end-of-file
+ *   would truncate the sinks and erase the pending flags even though the
+ *   data was never copied.
+ * - On end of file the sinks are trimmed to the source's size.
+ * - Otherwise the block is written to every active sink at the offset it
+ *   was read from; an all-zero block of a file with holes is skipped.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_data_read_cbk (call_frame_t *frame, void *cookie,
+                      xlator_t *this, int32_t op_ret, int32_t op_errno,
+                      struct iovec *vector, int32_t count, struct stat *buf)
+{
+        afr_private_t   *priv  = NULL;
+        afr_local_t     *local = NULL;
+        afr_self_heal_t *sh    = NULL;
+
+        int child_index = (long) cookie;
+        int i           = 0;
+        int call_count  = 0;
+
+        off_t offset;
+
+        priv  = this->private;
+        local = frame->local;
+        sh    = &local->self_heal;
+
+        /* one write reply expected per active sink */
+        call_count = sh->active_sinks;
+
+        local->call_count = call_count;
+
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "read from %s failed on subvolume %s (%s)",
+                        local->loc.path,
+                        priv->children[child_index]->name,
+                        strerror (op_errno));
+
+                sh->op_failed = 1;
+                afr_sh_data_finish (frame, this);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "read %d bytes of data from %s on child %d, offset %"PRId64"",
+                op_ret, local->loc.path, child_index, sh->offset);
+
+        if (op_ret == 0) {
+                /* end of file on the source */
+                afr_sh_data_trim_sinks (frame, this);
+                return 0;
+        }
+
+        /* what if we read less than block size? */
+        offset = sh->offset;
+        sh->offset += op_ret;
+
+        frame->root->req_refs = frame->root->rsp_refs;
+
+        if (sh->file_has_holes && iov_0filled (vector, count) == 0) {
+                /* the iter function depends on the
+                   sh->offset already being updated
+                   above
+                */
+                afr_sh_data_read_write_iter (frame, this);
+                return 0;
+        }
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (sh->sources[i] || !local->child_up[i])
+                        continue;
+
+                /* this is a sink, so write to it */
+                STACK_WIND_COOKIE (frame, afr_sh_data_write_cbk,
+                                   (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->writev,
+                                   sh->healing_fd, vector, count, offset);
+
+                /* stop once the expected number of calls is out */
+                if (!--call_count)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Reads the next block (sh->block_size bytes at sh->offset) from the
+ * source child through the healing fd.  The reply is handled by
+ * afr_sh_data_read_cbk.  Returns 0.
+ */
+int
+afr_sh_data_read_write (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t   *priv  = this->private;
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        xlator_t        *src   = priv->children[sh->source];
+
+        STACK_WIND_COOKIE (frame, afr_sh_data_read_cbk,
+                           (void *) (long) sh->source,
+                           src,
+                           src->fops->readv,
+                           sh->healing_fd, sh->block_size,
+                           sh->offset);
+
+        return 0;
+}
+
+
+/*
+ * Decides the next step of the copy loop:
+ *   - a previous operation failed      -> tear down (afr_sh_data_finish)
+ *   - offset reached the source's size -> trim the sinks
+ *   - otherwise                        -> read the next block
+ * Returns 0.
+ */
+int
+afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+
+        if (sh->op_failed) {
+                afr_sh_data_finish (frame, this);
+                return 0;
+        }
+
+        if (sh->offset < sh->file_size) {
+                afr_sh_data_read_write (frame, this);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "closing fd's of %s",
+                local->loc.path);
+        afr_sh_data_trim_sinks (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Callback for every open sent by afr_sh_data_open.  @cookie is the index
+ * of the replying child.  A failed open marks the heal as failed.  After
+ * the last reply, either the heal is torn down (on failure) or the copy
+ * loop is started.
+ */
+int
+afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, fd_t *fd)
+{
+        afr_local_t     *local       = frame->local;
+        afr_self_heal_t *sh          = &local->self_heal;
+        afr_private_t   *priv        = this->private;
+        int              child_index = (long) cookie;
+        int              remaining   = 0;
+
+        /* TODO: some of the open's might fail.
+           In that case, modify cleanup fn to send flush on those
+           fd's which are already open */
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "open of %s failed on child %s (%s)",
+                                local->loc.path,
+                                priv->children[child_index]->name,
+                                strerror (op_errno));
+                        sh->op_failed = 1;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        if (sh->op_failed) {
+                afr_sh_data_finish (frame, this);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "fd for %s opened, commencing sync",
+                local->loc.path);
+
+        gf_log (this->name, GF_LOG_WARNING,
+                "sourcing file %s from %s to other sinks",
+                local->loc.path, priv->children[sh->source]->name);
+
+        afr_sh_data_read_write (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Creates the healing fd and opens the file read-only on the source and
+ * write-only on every child that is not a source and is up.  Replies are
+ * collected by afr_sh_data_open_cbk.  Also records the copy parameters:
+ * block size, the source's file size and whether the source has holes.
+ * Returns 0.
+ *
+ * NOTE(review): the result of fd_create is not checked.
+ */
+int
+afr_sh_data_open (call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ int call_count = 0;
+
+ int source = -1;
+ int *sources = NULL;
+
+ fd_t *fd = NULL;
+
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ /* one open for the source plus one per active sink */
+ call_count = sh->active_sinks + 1;
+ local->call_count = call_count;
+
+ fd = fd_create (local->loc.inode, frame->root->pid);
+ sh->healing_fd = fd;
+
+ source = local->self_heal.source;
+ sources = local->self_heal.sources;
+
+ sh->block_size = 65536;
+ sh->file_size = sh->buf[source].st_size;
+
+ if (FILE_HAS_HOLES (&sh->buf[source]))
+ sh->file_has_holes = 1;
+
+ /* open source */
+ STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk,
+ (void *) (long) source,
+ priv->children[source],
+ priv->children[source]->fops->open,
+ &local->loc, O_RDONLY|O_LARGEFILE, fd);
+ call_count--;
+
+ /* open sinks */
+ for (i = 0; i < priv->child_count; i++) {
+ if(sources[i] || !local->child_up[i])
+ continue;
+
+ STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->open,
+ &local->loc,
+ O_WRONLY|O_LARGEFILE, fd);
+
+ /* stop once the expected number of calls is out */
+ if (!--call_count)
+ break;
+ }
+
+ return 0;
+}
+
+
+/*
+ * Counts the active sinks (children that are up and not a source), flags
+ * them and the source in sh->success, and starts the sync by opening the
+ * file.  With no active sink there is nothing to copy, so the heal is
+ * torn down right away.  Returns 0.
+ */
+int
+afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local  = frame->local;
+        afr_self_heal_t *sh     = &local->self_heal;
+        afr_private_t   *priv   = this->private;
+        int              source = sh->source;
+        int              nsinks = 0;
+        int              idx    = 0;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (sh->sources[idx] != 0)
+                        continue;
+                if (local->child_up[idx] != 1)
+                        continue;
+
+                nsinks++;
+                sh->success[idx] = 1;
+        }
+        sh->success[source] = 1;
+
+        if (nsinks == 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no active sinks for performing self-heal on file %s",
+                        local->loc.path);
+                afr_sh_data_finish (frame, this);
+                return 0;
+        }
+        sh->active_sinks = nsinks;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "syncing data of %s from subvolume %s to %d active sinks",
+                local->loc.path, priv->children[source]->name, nsinks);
+
+        afr_sh_data_open (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Decides which child is the source for data self-heal, using the xattrs
+ * and stat buffers gathered by the lookup step, then starts the sync.
+ *
+ * If no source can be determined and a usable favorite child is
+ * configured, that child is taken as the source.  If there is still no
+ * source, an error is logged, local->govinda_gOvinda is set and the heal
+ * is torn down.  Returns 0.
+ */
+int
+afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int nsources = 0;
+ int source = 0;
+ int i = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ /* build the pending matrix from the AFR_DATA_PENDING xattrs and
+ derive the initial set of sources from it */
+ afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr,
+ priv->child_count, AFR_DATA_PENDING);
+
+ afr_sh_print_pending_matrix (sh->pending_matrix, this);
+
+
+ afr_sh_mark_sources (sh->pending_matrix, sh->sources,
+ priv->child_count);
+
+ afr_sh_supress_empty_children (sh->sources, sh->xattr, sh->buf,
+ priv->child_count, AFR_DATA_PENDING);
+
+ afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
+ priv->child_count);
+
+ nsources = afr_sh_source_count (sh->sources, priv->child_count);
+
+ /* no source found: fall back to the favorite child, if one is
+ configured and it reported no error */
+ if ((nsources == 0)
+ && (priv->favorite_child != -1)
+ && (sh->child_errno[priv->favorite_child] == 0)) {
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "Picking favorite child %s as authentic source to resolve conflicting data of %s",
+ priv->children[priv->favorite_child]->name,
+ local->loc.path);
+
+ sh->sources[priv->favorite_child] = 1;
+
+ nsources = afr_sh_source_count (sh->sources,
+ priv->child_count);
+ }
+
+ if (nsources == 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to resolve conflicting data of %s. "
+ "Please resolve manually by deleting the file %s "
+ "from all but the preferred subvolume. "
+ "Please consider 'option favorite-child <>'",
+ local->loc.path, local->loc.path);
+
+ local->govinda_gOvinda = 1;
+
+ afr_sh_data_finish (frame, this);
+ return 0;
+ }
+
+ source = afr_sh_select_source (sh->sources, priv->child_count);
+ sh->source = source;
+
+ /* detect changes not visible through pending flags -- JIC */
+ /* a child whose size differs from the source's is treated as a sink */
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || sh->child_errno[i])
+ continue;
+
+ if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source]))
+ sh->sources[i] = 0;
+ }
+
+ afr_sh_data_sync_prepare (frame, this);
+
+ return 0;
+}
+
+
+/*
+ * Callback for every lookup sent by afr_sh_data_lookup.  @cookie is the
+ * index of the replying child.  On success the returned xattr dictionary
+ * (with a new reference) and stat buffer are stored in the child's slot.
+ * After the last reply the source is determined by afr_sh_data_fix.
+ */
+int
+afr_sh_data_lookup_cbk (call_frame_t *frame, void *cookie,
+                        xlator_t *this, int32_t op_ret, int32_t op_errno,
+                        inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+        afr_local_t     *local       = frame->local;
+        afr_self_heal_t *sh          = &local->self_heal;
+        int              child_index = (long) cookie;
+        int              remaining   = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        sh->xattr[child_index] = dict_ref (xattr);
+                        sh->buf[child_index]   = *buf;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        afr_sh_data_fix (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Sends a lookup to every child that is up, requesting the
+ * AFR_DATA_PENDING xattr.  Replies are collected by
+ * afr_sh_data_lookup_cbk.  Returns 0.
+ *
+ * NOTE(review): 'ret' is assigned but never read, so a dict_set_uint64
+ * failure goes unnoticed; if dict_new fails, a NULL request dictionary is
+ * passed to lookup.
+ */
+int
+afr_sh_data_lookup (call_frame_t *frame, xlator_t *this)
+{
+ afr_self_heal_t *sh = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xattr_req = NULL;
+
+ int call_count = 0;
+ int i = 0;
+ int ret = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ call_count = local->child_count;
+
+ local->call_count = call_count;
+
+ /* the value stored under the key is child_count * sizeof(int32_t) */
+ xattr_req = dict_new();
+ if (xattr_req)
+ ret = dict_set_uint64 (xattr_req, AFR_DATA_PENDING,
+ priv->child_count * sizeof(int32_t));
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_sh_data_lookup_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->lookup,
+ &local->loc, xattr_req);
+ /* stop once the expected number of calls is out */
+ if (!--call_count)
+ break;
+ }
+ }
+
+ /* drop our own reference to the request dictionary */
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ return 0;
+}
+
+
+/*
+ * Callback for every inodelk sent by afr_sh_data_lock.  @cookie is the
+ * index of the replying child.  A failed lock marks the heal as failed
+ * (EAGAIN is logged at debug level only, other errors as errors).  After
+ * the last reply, either the heal is torn down or the lookup step starts.
+ */
+int
+afr_sh_data_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t     *local       = frame->local;
+        afr_self_heal_t *sh          = &local->self_heal;
+        int              child_index = (long) cookie;
+        int              remaining   = 0;
+
+        /* TODO: what if lock fails? */
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "inode of %s on child %d locked",
+                                local->loc.path, child_index);
+                } else {
+                        sh->op_failed = 1;
+
+                        gf_log (this->name,
+                                (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
+                                "locking of %s on child %d failed: %s",
+                                local->loc.path, child_index,
+                                strerror (op_errno));
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        if (sh->op_failed) {
+                afr_sh_data_finish (frame, this);
+                return 0;
+        }
+
+        afr_sh_data_lookup (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Requests a non-blocking (F_SETLK) whole-file write lock on the inode
+ * from every child that is up.  Replies are collected by
+ * afr_sh_data_lock_cbk.  Returns 0.
+ *
+ * The flock structure is zero-initialised so that l_whence and l_pid do
+ * not carry stack garbage to the subvolumes; l_start = 0, l_len = 0
+ * relative to SEEK_SET covers the entire file.
+ */
+int
+afr_sh_data_lock (call_frame_t *frame, xlator_t *this)
+{
+        struct flock    flock      = {0, };
+        int             i          = 0;
+        int             call_count = 0;
+
+        afr_local_t    *local      = NULL;
+        afr_private_t  *priv       = NULL;
+
+
+        local = frame->local;
+        priv  = this->private;
+
+        call_count = local->child_count;
+
+        local->call_count = call_count;
+
+        flock.l_whence = SEEK_SET;
+        flock.l_start  = 0;
+        flock.l_len    = 0;
+        flock.l_type   = F_WRLCK;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (local->child_up[i]) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "locking %s on subvolume %s",
+                                local->loc.path, priv->children[i]->name);
+
+                        STACK_WIND_COOKIE (frame, afr_sh_data_lock_cbk,
+                                           (void *) (long) i,
+                                           priv->children[i],
+                                           priv->children[i]->fops->inodelk,
+                                           &local->loc, F_SETLK, &flock);
+                        /* stop once the expected number of calls is out */
+                        if (!--call_count)
+                                break;
+                }
+        }
+
+        return 0;
+}
+
+
+/*
+ * Entry point of data self-heal.  The heal runs only when the file needs
+ * it (local->need_data_self_heal) and the feature is enabled
+ * (priv->data_self_heal); otherwise the stage completes immediately.
+ * Returns 0.
+ */
+int
+afr_self_heal_data (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t *priv  = this->private;
+        afr_local_t   *local = frame->local;
+
+        if (!local->need_data_self_heal || !priv->data_self_heal) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "not doing data self heal on %s",
+                        local->loc.path);
+                afr_sh_data_done (frame, this);
+                return 0;
+        }
+
+        afr_sh_data_lock (frame, this);
+
+        return 0;
+}
+
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
new file mode 100644
index 000000000..ec341922e
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -0,0 +1,2038 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+#include "byte-order.h"
+
+#include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-self-heal-common.h"
+
+
+
+/*
+ * Final step of entry self-heal: log completion and hand control back to
+ * whoever started the heal through sh->completion_cbk.  Returns 0.
+ */
+int
+afr_sh_entry_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+
+        /* TODO: cleanup sh->* */
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "self heal of %s completed", local->loc.path);
+
+        sh->completion_cbk (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Callback for every entrylk unlock sent by afr_sh_entry_unlock.  @cookie
+ * is the index of the replying child.  The result is logged only.  After
+ * the last reply the healing fd (if any) is released and the entry heal
+ * is completed.
+ */
+int
+afr_sh_entry_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t     *local       = frame->local;
+        afr_self_heal_t *sh          = &local->self_heal;
+        int              child_index = (long) cookie;
+        int              remaining   = 0;
+
+        /* TODO: what if lock fails? */
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "unlocked inode of %s on child %d",
+                                local->loc.path, child_index);
+                } else {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "unlocking inode of %s on child %d failed: %s",
+                                local->loc.path, child_index,
+                                strerror (op_errno));
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        if (sh->healing_fd)
+                fd_unref (sh->healing_fd);
+        sh->healing_fd = NULL;
+        afr_sh_entry_done (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Sends an entrylk unlock (ENTRYLK_UNLOCK / ENTRYLK_WRLCK, NULL basename)
+ * on local->loc to every child that is up.  Replies are collected by
+ * afr_sh_entry_unlck_cbk.  Returns 0.
+ */
+int
+afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ int call_count = 0;
+
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ afr_self_heal_t * sh = NULL;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ call_count = local->child_count;
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unlocking %s on subvolume %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_entry_unlck_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->entrylk,
+ &local->loc, NULL,
+ ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+ /* stop once the expected number of calls is out */
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+/*
+ * Begins the teardown of an entry self-heal: logs and then releases the
+ * entry locks on the children.  Returns 0.
+ */
+int
+afr_sh_entry_finish (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "finishing entry selfheal of %s", local->loc.path);
+
+        afr_sh_entry_unlock (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Callback for every xattrop sent by afr_sh_entry_erase_pending.  The
+ * result of the individual operation is not inspected; once all replies
+ * are in, the heal moves on to afr_sh_entry_finish.
+ */
+int
+afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie,
+                                xlator_t *this, int32_t op_ret,
+                                int32_t op_errno, dict_t *xattr)
+{
+        int remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        afr_sh_entry_finish (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Sends an xattrop (GF_XATTROP_ADD_ARRAY) carrying the AFR_ENTRY_PENDING
+ * delta to every child for which a lookup xattr was recorded.  Replies
+ * are collected by afr_sh_entry_erase_pending_cbk.  Returns 0.
+ *
+ * NOTE(review): the CALLOC and get_new_dict results are not checked.
+ * NOTE(review): if no child has sh->xattr[i] set, nothing is wound and no
+ * callback will continue the heal -- confirm this cannot happen here.
+ */
+int
+afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int i = 0;
+ dict_t **erase_xattr = NULL;
+
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+
+ afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix,
+ sh->success, priv->child_count);
+
+ erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count);
+
+ /* one dictionary, and one expected reply, per child that returned
+ an xattr during lookup */
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->xattr[i]) {
+ call_count++;
+
+ erase_xattr[i] = get_new_dict();
+ dict_ref (erase_xattr[i]);
+ }
+ }
+
+ afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr,
+ priv->child_count, AFR_ENTRY_PENDING);
+
+ local->call_count = call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!erase_xattr[i])
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "erasing pending flags from %s on %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->loc,
+ GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
+ if (!--call_count)
+ break;
+ }
+
+ /* drop the references taken above and free the pointer array */
+ for (i = 0; i < priv->child_count; i++) {
+ if (erase_xattr[i]) {
+ dict_unref (erase_xattr[i]);
+ }
+ }
+ FREE (erase_xattr);
+
+ return 0;
+}
+
+
+
+/*
+ * Returns the child to use as source after current_active_source, or -1
+ * when there is none.
+ *
+ * With a definite source (sh->source != -1) that source is returned once:
+ * if it is already the current one, the result is -1.  Without a definite
+ * source, the first child after current_active_source that is up and not
+ * marked in sh->sources is returned.
+ */
+static int
+next_active_source (call_frame_t *frame, xlator_t *this,
+                    int current_active_source)
+{
+        afr_private_t   *priv  = this->private;
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        int              idx   = 0;
+
+        if (sh->source != -1) {
+                if (current_active_source == sh->source)
+                        return -1;
+                return sh->source;
+        }
+
+        /*
+          the next active sink becomes the source for the
+          'conservative decision' of merging all entries
+        */
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (idx <= current_active_source)
+                        continue;
+                if (sh->sources[idx] != 0)
+                        continue;
+                if (local->child_up[idx] != 1)
+                        continue;
+
+                return idx;
+        }
+
+        return -1;
+}
+
+
+
+/*
+ * Returns the first child after current_active_sink that is up and not
+ * marked in sh->sources, or -1 when there is none.
+ */
+static int
+next_active_sink (call_frame_t *frame, xlator_t *this,
+                  int current_active_sink)
+{
+        afr_private_t   *priv  = this->private;
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        int              idx   = 0;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (idx <= current_active_sink)
+                        continue;
+                if (sh->sources[idx] != 0)
+                        continue;
+                if (local->child_up[idx] != 1)
+                        continue;
+
+                return idx;
+        }
+
+        return -1;
+}
+
+
+/*
+ * build_child_loc - fill @child so that it describes entry @name inside
+ * the directory described by @parent.
+ *
+ * @this:   translator, used for logging only
+ * @child:  loc to fill; path is newly allocated, name points into path,
+ *          parent is a new reference on parent->inode, inode is a new
+ *          inode from the parent's inode table
+ * @parent: loc of the containing directory
+ * @name:   basename of the entry
+ *
+ * Returns 0 on success and -1 on failure.  On failure a non-NULL @child
+ * is wiped with loc_wipe.
+ *
+ * The asprintf return value is checked because the pointer it writes is
+ * indeterminate on failure, and a NULL @child returns at once instead of
+ * being handed to loc_wipe.
+ */
+int
+build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name)
+{
+        int ret = -1;
+        int len = -1;
+
+        if (!child)
+                return -1;
+
+        /* avoid a doubled slash for entries directly under the root */
+        if (strcmp (parent->path, "/") == 0)
+                len = asprintf ((char **)&child->path, "/%s", name);
+        else
+                len = asprintf ((char **)&child->path, "%s/%s",
+                                parent->path, name);
+
+        if (len == -1) {
+                /* do not let loc_wipe see an indeterminate pointer */
+                child->path = NULL;
+
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                goto out;
+        }
+
+        child->name = strrchr (child->path, '/');
+        if (child->name)
+                child->name++;
+
+        child->parent = inode_ref (parent->inode);
+        child->inode  = inode_new (parent->inode->table);
+
+        if (!child->inode) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                goto out;
+        }
+
+        ret = 0;
+out:
+        if (ret == -1)
+                loc_wipe (child);
+
+        return ret;
+}
+
+
+/* forward declarations of the expunge drivers */
+int
+afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this);
+
+int
+afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
+                             int active_src);
+
+/*
+ * Called when the expunge of one entry on child @active_src is over.
+ * Once the frame's outstanding-call counter drops to zero, the expunge
+ * of that subvolume is resumed.  Returns 0.
+ */
+int
+afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this,
+                                 int active_src)
+{
+        int remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        afr_sh_entry_expunge_subvol (frame, this, active_src);
+
+        return 0;
+}
+
+
+/*
+ * Callback shared by the rmdir and unlink issued during expunge.
+ *
+ * The cookie carries the index of the subvolume the removal was sent to.
+ * The outcome is logged, the per-entry frame is destroyed and the entry
+ * is reported as done on the parent self-heal frame.  Always returns 0.
+ */
+int
+afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie,
+                                 xlator_t *this,
+                                 int32_t op_ret, int32_t op_errno)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *expunge_local = expunge_frame->local;
+        call_frame_t  *sh_frame      = expunge_local->self_heal.sh_frame;
+        int            active_src    = (long) cookie;
+
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "removing %s on %s failed (%s)",
+                        expunge_local->loc.path,
+                        priv->children[active_src]->name,
+                        strerror (op_errno));
+        } else {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "removed %s on %s",
+                        expunge_local->loc.path,
+                        priv->children[active_src]->name);
+        }
+
+        /* sh_frame was read above because the local goes away here */
+        AFR_STACK_DESTROY (expunge_frame);
+        afr_sh_entry_expunge_entry_done (sh_frame, this, active_src);
+
+        return 0;
+}
+
+
+/*
+ * Send an rmdir for the per-entry loc to subvolume 'active_src'.
+ * The reply is handled by afr_sh_entry_expunge_remove_cbk, which gets
+ * the subvolume index back through the cookie.  Always returns 0.
+ */
+int
+afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this,
+                            int active_src)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *expunge_local = expunge_frame->local;
+
+        gf_log (this->name, GF_LOG_WARNING,
+                "removing directory %s on %s",
+                expunge_local->loc.path,
+                priv->children[active_src]->name);
+
+        STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
+                           (void *) (long) active_src,
+                           priv->children[active_src],
+                           priv->children[active_src]->fops->rmdir,
+                           &expunge_local->loc);
+
+        return 0;
+}
+
+
+/*
+ * Send an unlink for the per-entry loc to subvolume 'active_src'.
+ * The reply is handled by afr_sh_entry_expunge_remove_cbk, which gets
+ * the subvolume index back through the cookie.  Always returns 0.
+ */
+int
+afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this,
+                             int active_src)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *expunge_local = expunge_frame->local;
+
+        gf_log (this->name, GF_LOG_WARNING,
+                "unlinking file %s on %s",
+                expunge_local->loc.path,
+                priv->children[active_src]->name);
+
+        STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
+                           (void *) (long) active_src,
+                           priv->children[active_src],
+                           priv->children[active_src]->fops->unlink,
+                           &expunge_local->loc);
+
+        return 0;
+}
+
+
+/*
+ * Remove the entry described by the per-entry loc from 'active_src'.
+ *
+ * 'buf' is the stat returned by the lookup on active_src; its file type
+ * selects rmdir (directories) or unlink (every other known type).  For an
+ * unrecognised type nothing is removed: the per-entry frame is destroyed
+ * and the entry is reported as done.  Always returns 0.
+ */
+int
+afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this,
+                             int active_src, struct stat *buf)
+{
+        afr_private_t   *priv          = NULL;
+        afr_local_t     *expunge_local = NULL;
+        afr_self_heal_t *expunge_sh    = NULL;
+        call_frame_t    *frame         = NULL;
+        int              type          = 0;
+
+        priv          = this->private;
+        expunge_local = expunge_frame->local;
+        expunge_sh    = &expunge_local->self_heal;
+        frame         = expunge_sh->sh_frame;
+
+        type = (buf->st_mode & S_IFMT);
+
+        switch (type) {
+        case S_IFSOCK:
+        case S_IFREG:
+        case S_IFBLK:
+        case S_IFCHR:
+        case S_IFIFO:
+        case S_IFLNK:
+                afr_sh_entry_expunge_unlink (expunge_frame, this, active_src);
+                break;
+        case S_IFDIR:
+                afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src);
+                break;
+        default:
+                /*
+                 * The stat in 'buf' came from active_src, so that is the
+                 * subvolume to name here.  The per-entry self-heal state
+                 * only has sh_frame and active_source filled in by
+                 * afr_sh_entry_expunge_entry; its 'source' field is never
+                 * set and must not be used as a child index.
+                 */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "%s has unknown file type on %s: 0%o",
+                        expunge_local->loc.path,
+                        priv->children[active_src]->name, type);
+                goto out;
+        }
+
+        return 0;
+out:
+        AFR_STACK_DESTROY (expunge_frame);
+        afr_sh_entry_expunge_entry_done (frame, this, active_src);
+
+        return 0;
+}
+
+
+/*
+ * Callback of the lookup sent to the active sink by
+ * afr_sh_entry_expunge_purge.
+ *
+ * The cookie is the index of that subvolume.  When the lookup succeeded
+ * the stat is handed to afr_sh_entry_expunge_remove; when it failed
+ * (op_ret == -1) the per-entry frame is destroyed and the entry is
+ * reported as done.  Always returns 0.
+ */
+int
+afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie,
+                                 xlator_t *this,
+                                 int32_t op_ret, int32_t op_errno,
+                                 inode_t *inode, struct stat *buf, dict_t *x)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *expunge_local = expunge_frame->local;
+        call_frame_t  *sh_frame      = expunge_local->self_heal.sh_frame;
+        int            active_src    = (long) cookie;
+
+        if (op_ret != -1) {
+                afr_sh_entry_expunge_remove (expunge_frame, this,
+                                             active_src, buf);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "lookup of %s on %s failed (%s)",
+                expunge_local->loc.path,
+                priv->children[active_src]->name,
+                strerror (op_errno));
+
+        AFR_STACK_DESTROY (expunge_frame);
+        afr_sh_entry_expunge_entry_done (sh_frame, this, active_src);
+
+        return 0;
+}
+
+
+/*
+ * Start purging an entry from 'active_src': look the entry up there
+ * first, so that afr_sh_entry_expunge_lookup_cbk receives its stat.
+ * Always returns 0.
+ */
+int
+afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this,
+                            int active_src)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *expunge_local = expunge_frame->local;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "looking up %s on %s",
+                expunge_local->loc.path,
+                priv->children[active_src]->name);
+
+        STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk,
+                           (void *) (long) active_src,
+                           priv->children[active_src],
+                           priv->children[active_src]->fops->lookup,
+                           &expunge_local->loc, 0);
+
+        return 0;
+}
+
+
+/*
+ * Callback of the lookup sent to the source subvolume for one entry
+ * found on the active sink.
+ *
+ * The cookie is the index of the source.  If the source reports ENOENT
+ * the entry is purged from the active sink.  In every other case (entry
+ * exists, or the lookup failed for another reason) the entry is left
+ * alone: the per-entry frame is destroyed and the entry is reported as
+ * done.  Always returns 0.
+ */
+int
+afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie,
+                                xlator_t *this,
+                                int32_t op_ret, int32_t op_errno,
+                                inode_t *inode, struct stat *buf, dict_t *x)
+{
+        afr_private_t   *priv          = this->private;
+        afr_local_t     *expunge_local = expunge_frame->local;
+        afr_self_heal_t *expunge_sh    = &expunge_local->self_heal;
+        call_frame_t    *sh_frame      = expunge_sh->sh_frame;
+        int              active_src    = expunge_sh->active_source;
+        int              source        = (long) cookie;
+        int              missing       = 0;
+
+        missing = ((op_ret == -1) && (op_errno == ENOENT));
+
+        if (missing) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "missing entry %s on %s",
+                        expunge_local->loc.path,
+                        priv->children[source]->name);
+
+                afr_sh_entry_expunge_purge (expunge_frame, this, active_src);
+
+                return 0;
+        }
+
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "looking up %s under %s failed (%s)",
+                        expunge_local->loc.path,
+                        priv->children[source]->name,
+                        strerror (op_errno));
+        } else {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "%s exists under %s",
+                        expunge_local->loc.path,
+                        priv->children[source]->name);
+        }
+
+        AFR_STACK_DESTROY (expunge_frame);
+        afr_sh_entry_expunge_entry_done (sh_frame, this, active_src);
+
+        return 0;
+}
+
+
+/*
+ * Examine one directory entry 'name' read from the active sink during the
+ * expunge pass.
+ *
+ * "." and ".." are skipped.  For any other name a copy of the self-heal
+ * frame is made, given its own local with the child's loc, and a lookup is
+ * sent to the source subvolume; afr_sh_entry_expunge_entry_cbk decides what
+ * happens next.  When the entry is skipped or setup fails, the entry is
+ * reported as done immediately.  Always returns 0.
+ */
+int
+afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this,
+ char *name)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int ret = -1;
+ call_frame_t *expunge_frame = NULL;
+ afr_local_t *expunge_local = NULL;
+ afr_self_heal_t *expunge_sh = NULL;
+ int active_src = 0;
+ int source = 0;
+ /* not used directly below; presumably referenced by ALLOC_OR_GOTO -- TODO confirm */
+ int op_errno = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ active_src = sh->active_source;
+ source = sh->source;
+
+ /* ret is still -1 here, so the 'out' path reports the entry as done */
+ if ((strcmp (name, ".") == 0)
+ || (strcmp (name, "..") == 0)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "skipping inspection of %s under %s",
+ name, local->loc.path);
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "inspecting existance of %s under %s",
+ name, local->loc.path);
+
+ expunge_frame = copy_frame (frame);
+ if (!expunge_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ /*
+ * NOTE(review): if this allocation or build_child_loc below fails,
+ * expunge_frame is not destroyed on the 'out' path -- looks like a
+ * leak; confirm.
+ */
+ ALLOC_OR_GOTO (expunge_local, afr_local_t, out);
+
+ /* the per-entry local records the parent frame and the active sink */
+ expunge_frame->local = expunge_local;
+ expunge_sh = &expunge_local->self_heal;
+ expunge_sh->sh_frame = frame;
+ expunge_sh->active_source = active_src;
+
+ ret = build_child_loc (this, &expunge_local->loc, &local->loc, name);
+ if (ret != 0) {
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "looking up %s on %s", expunge_local->loc.path,
+ priv->children[source]->name);
+
+ /* the cookie carries the source index to the callback */
+ STACK_WIND_COOKIE (expunge_frame,
+ afr_sh_entry_expunge_entry_cbk,
+ (void *) (long) source,
+ priv->children[source],
+ priv->children[source]->fops->lookup,
+ &expunge_local->loc, 0);
+
+ ret = 0;
+out:
+ if (ret == -1)
+ afr_sh_entry_expunge_entry_done (frame, this, active_src);
+
+ return 0;
+}
+
+
+/*
+ * Callback of the readdir sent to the active sink during expunge.
+ *
+ * op_ret <= 0 (error or end of directory) ends the work on this sink and
+ * moves on through afr_sh_entry_expunge_all.  Otherwise the offset of the
+ * last entry is remembered for the next readdir, the frame's call count
+ * is set to the number of entries, and every entry is handed to
+ * afr_sh_entry_expunge_entry.  Always returns 0.
+ */
+int
+afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie,
+                                  xlator_t *this,
+                                  int32_t op_ret, int32_t op_errno,
+                                  gf_dirent_t *entries)
+{
+        afr_private_t   *priv       = this->private;
+        afr_local_t     *local      = frame->local;
+        afr_self_heal_t *sh         = &local->self_heal;
+        int              active_src = sh->active_source;
+        gf_dirent_t     *entry      = NULL;
+        off_t            resume_at  = 0;
+        int              nentries   = 0;
+
+        if (op_ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "readdir of %s on subvolume %s failed (%s)",
+                        local->loc.path,
+                        priv->children[active_src]->name,
+                        strerror (op_errno));
+        } else if (op_ret == 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "readdir of %s on subvolume %s complete",
+                        local->loc.path,
+                        priv->children[active_src]->name);
+        }
+
+        if (op_ret <= 0) {
+                afr_sh_entry_expunge_all (frame, this);
+                return 0;
+        }
+
+        /* first pass: count the entries and find the resume offset */
+        list_for_each_entry (entry, &entries->list, list) {
+                nentries++;
+                resume_at = entry->d_off;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "readdir'ed %d entries from %s",
+                nentries, priv->children[active_src]->name);
+
+        /* both must be in place before any entry can complete */
+        sh->offset        = resume_at;
+        local->call_count = nentries;
+
+        /* second pass: process each entry */
+        list_for_each_entry (entry, &entries->list, list) {
+                afr_sh_entry_expunge_entry (frame, this, entry->d_name);
+        }
+
+        return 0;
+}
+
+/*
+ * Read the next batch of directory entries from subvolume 'active_src',
+ * using the healing fd, block size and offset kept in the self-heal
+ * state.  The reply goes to afr_sh_entry_expunge_readdir_cbk.
+ * Always returns 0.
+ */
+int
+afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
+                             int active_src)
+{
+        afr_private_t   *priv  = this->private;
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+
+        STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk,
+                    priv->children[active_src],
+                    priv->children[active_src]->fops->readdir,
+                    sh->healing_fd, sh->block_size, sh->offset);
+
+        return 0;
+}
+
+
+/*
+ * Drive the expunge pass over the sinks, one sink at a time.
+ *
+ * The readdir offset is reset and the next sink after sh->active_source
+ * is selected.  The pass ends, and afr_sh_entry_erase_pending is called,
+ * when there is no source (sh->source == -1), when an earlier operation
+ * failed (sh->op_failed) or when no sink is left.  Otherwise expunging
+ * starts on the selected sink.  Always returns 0.
+ */
+int
+afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t   *priv      = this->private;
+        afr_local_t     *local     = frame->local;
+        afr_self_heal_t *sh        = &local->self_heal;
+        int              next_sink = -1;
+
+        sh->offset = 0;
+
+        if (sh->source == -1) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no active sources for %s to expunge entries",
+                        local->loc.path);
+
+                afr_sh_entry_erase_pending (frame, this);
+                return 0;
+        }
+
+        next_sink = next_active_sink (frame, this, sh->active_source);
+        sh->active_source = next_sink;
+
+        /* stop on an earlier failure, or once every sink has been done */
+        if (sh->op_failed || (next_sink == -1)) {
+                afr_sh_entry_erase_pending (frame, this);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "expunging entries of %s on %s to other sinks",
+                local->loc.path, priv->children[next_sink]->name);
+
+        afr_sh_entry_expunge_subvol (frame, this, next_sink);
+
+        return 0;
+}
+
+
+int
+afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this);
+
+int
+afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this,
+ int active_src);
+
+/*
+ * Account for one finished directory entry of the impunge pass.
+ *
+ * afr_frame_return() yields the call count left on the self-heal frame;
+ * once it reaches 0 the next batch of entries is requested from
+ * 'active_src' through afr_sh_entry_impunge_subvol().  Always returns 0.
+ */
+int
+afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this,
+                                 int active_src)
+{
+        int remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        afr_sh_entry_impunge_subvol (frame, this, active_src);
+
+        return 0;
+}
+
+
+/*
+ * Callback of the utimens sent to a sink after an entry was recreated
+ * there; this is the last step for that sink.
+ *
+ * The cookie is the index of the sink.  The outcome is logged and the
+ * per-entry call count is decremented; when it reaches 0 the per-entry
+ * frame is destroyed and the entry is reported as done on the parent
+ * self-heal frame.  Always returns 0.
+ */
+int
+afr_sh_entry_impunge_utimens_cbk (call_frame_t *impunge_frame, void *cookie,
+                                  xlator_t *this, int32_t op_ret,
+                                  int32_t op_errno, struct stat *stbuf)
+{
+        int              call_count    = 0;
+        afr_private_t   *priv          = NULL;
+        afr_local_t     *impunge_local = NULL;
+        afr_self_heal_t *impunge_sh    = NULL;
+        call_frame_t    *frame         = NULL;
+        int              active_src    = 0;
+        int              child_index   = 0;
+
+        priv          = this->private;
+        impunge_local = impunge_frame->local;
+        impunge_sh    = &impunge_local->self_heal;
+        frame         = impunge_sh->sh_frame;
+        child_index   = (long) cookie;
+
+        /*
+         * The subvolume being read is recorded in the per-entry state;
+         * without loading it, completion would always resume the readdir
+         * on child 0.
+         */
+        active_src = impunge_sh->active_source;
+
+        if (op_ret == 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "utimes set for %s on %s",
+                        impunge_local->loc.path,
+                        priv->children[child_index]->name);
+        } else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "setting utimes of %s on %s failed (%s)",
+                        impunge_local->loc.path,
+                        priv->children[child_index]->name,
+                        strerror (op_errno));
+        }
+
+        LOCK (&impunge_frame->lock);
+        {
+                call_count = --impunge_local->call_count;
+        }
+        UNLOCK (&impunge_frame->lock);
+
+        if (call_count == 0) {
+                AFR_STACK_DESTROY (impunge_frame);
+                afr_sh_entry_impunge_entry_done (frame, this, active_src);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Callback of the chown sent to a sink after an entry was recreated there.
+ *
+ * The cookie is the index of the sink.  On success the access and
+ * modification times saved from the lookup on the active source are
+ * applied to the new entry with utimens.  On failure the per-entry call
+ * count is decremented; when it reaches 0 the per-entry frame is destroyed
+ * and the entry is reported as done.  Always returns 0.
+ */
+int
+afr_sh_entry_impunge_chown_cbk (call_frame_t *impunge_frame, void *cookie,
+                                xlator_t *this, int32_t op_ret,
+                                int32_t op_errno, struct stat *stbuf)
+{
+        int              call_count    = 0;
+        afr_private_t   *priv          = NULL;
+        afr_local_t     *impunge_local = NULL;
+        afr_self_heal_t *impunge_sh    = NULL;
+        call_frame_t    *frame         = NULL;
+        int              active_src    = 0;
+        int              child_index   = 0;
+        /*
+         * Zeroed so that tv_nsec is defined on platforms where only
+         * second-resolution timestamps are copied below.
+         */
+        struct timespec  ts[2]         = { {0}, {0} };
+
+        priv          = this->private;
+        impunge_local = impunge_frame->local;
+        impunge_sh    = &impunge_local->self_heal;
+        frame         = impunge_sh->sh_frame;
+        child_index   = (long) cookie;
+
+        /*
+         * The subvolume being read is recorded in the per-entry state;
+         * without loading it, completion would always resume the readdir
+         * on child 0.
+         */
+        active_src = impunge_sh->active_source;
+
+        if (op_ret == 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "ownership of %s on %s changed",
+                        impunge_local->loc.path,
+                        priv->children[child_index]->name);
+        } else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "setting ownership of %s on %s failed (%s)",
+                        impunge_local->loc.path,
+                        priv->children[child_index]->name,
+                        strerror (op_errno));
+                goto out;
+        }
+
+#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC
+        ts[0] = impunge_local->cont.lookup.buf.st_atim;
+        ts[1] = impunge_local->cont.lookup.buf.st_mtim;
+#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC
+        ts[0] = impunge_local->cont.lookup.buf.st_atimespec;
+        ts[1] = impunge_local->cont.lookup.buf.st_mtimespec;
+#else
+        ts[0].tv_sec = impunge_local->cont.lookup.buf.st_atime;
+        ts[1].tv_sec = impunge_local->cont.lookup.buf.st_mtime;
+#endif
+        STACK_WIND_COOKIE (impunge_frame,
+                           afr_sh_entry_impunge_utimens_cbk,
+                           (void *) (long) child_index,
+                           priv->children[child_index],
+                           priv->children[child_index]->fops->utimens,
+                           &impunge_local->loc, ts);
+
+        return 0;
+
+out:
+        LOCK (&impunge_frame->lock);
+        {
+                call_count = --impunge_local->call_count;
+        }
+        UNLOCK (&impunge_frame->lock);
+
+        if (call_count == 0) {
+                AFR_STACK_DESTROY (impunge_frame);
+                afr_sh_entry_impunge_entry_done (frame, this, active_src);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Callback shared by the mknod, mkdir and symlink that recreate a missing
+ * entry on a sink.
+ *
+ * The cookie is the index of the sink.  On success the new inode's mode
+ * is recorded and a chown is sent to give the entry the uid/gid saved
+ * from the lookup on the active source.  On failure the per-entry call
+ * count is decremented; when it reaches 0 the per-entry frame is destroyed
+ * and the entry is reported as done.  Always returns 0.
+ */
+int
+afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie,
+                                  xlator_t *this,
+                                  int32_t op_ret, int32_t op_errno,
+                                  inode_t *inode, struct stat *stbuf)
+{
+        int              call_count    = 0;
+        afr_private_t   *priv          = NULL;
+        afr_local_t     *impunge_local = NULL;
+        afr_self_heal_t *impunge_sh    = NULL;
+        call_frame_t    *frame         = NULL;
+        int              active_src    = 0;
+        int              child_index   = 0;
+
+        priv          = this->private;
+        impunge_local = impunge_frame->local;
+        impunge_sh    = &impunge_local->self_heal;
+        frame         = impunge_sh->sh_frame;
+
+        child_index = (long) cookie;
+
+        /*
+         * The subvolume being read is recorded in the per-entry state;
+         * without loading it, completion would always resume the readdir
+         * on child 0.
+         */
+        active_src = impunge_sh->active_source;
+
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "creation of %s on %s failed (%s)",
+                        impunge_local->loc.path,
+                        priv->children[child_index]->name,
+                        strerror (op_errno));
+                goto out;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "setting ownership of %s on %s to %d/%d",
+                impunge_local->loc.path,
+                priv->children[child_index]->name,
+                impunge_local->cont.lookup.buf.st_uid,
+                impunge_local->cont.lookup.buf.st_gid);
+
+        inode->st_mode = stbuf->st_mode;
+
+        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_chown_cbk,
+                           (void *) (long) child_index,
+                           priv->children[child_index],
+                           priv->children[child_index]->fops->chown,
+                           &impunge_local->loc,
+                           impunge_local->cont.lookup.buf.st_uid,
+                           impunge_local->cont.lookup.buf.st_gid);
+        return 0;
+
+out:
+        LOCK (&impunge_frame->lock);
+        {
+                call_count = --impunge_local->call_count;
+        }
+        UNLOCK (&impunge_frame->lock);
+
+        if (call_count == 0) {
+                AFR_STACK_DESTROY (impunge_frame);
+                afr_sh_entry_impunge_entry_done (frame, this, active_src);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Recreate a non-directory, non-symlink entry on sink 'child_index' with
+ * mknod, using the mode and device number from 'stbuf'.  The reply goes
+ * to afr_sh_entry_impunge_newfile_cbk.  Always returns 0.
+ */
+int
+afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this,
+                            int child_index, struct stat *stbuf)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *impunge_local = impunge_frame->local;
+
+        gf_log (this->name, GF_LOG_WARNING,
+                "creating file %s mode=0%o dev=0x%"GF_PRI_DEV" on %s",
+                impunge_local->loc.path,
+                stbuf->st_mode, stbuf->st_rdev,
+                priv->children[child_index]->name);
+
+        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
+                           (void *) (long) child_index,
+                           priv->children[child_index],
+                           priv->children[child_index]->fops->mknod,
+                           &impunge_local->loc,
+                           stbuf->st_mode, stbuf->st_rdev);
+
+        return 0;
+}
+
+
+
+/*
+ * Recreate a directory entry on sink 'child_index' with mkdir, using the
+ * mode from 'stbuf'.  The reply goes to afr_sh_entry_impunge_newfile_cbk.
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this,
+                            int child_index, struct stat *stbuf)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *impunge_local = impunge_frame->local;
+
+        gf_log (this->name, GF_LOG_WARNING,
+                "creating directory %s mode=0%o on %s",
+                impunge_local->loc.path,
+                stbuf->st_mode,
+                priv->children[child_index]->name);
+
+        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
+                           (void *) (long) child_index,
+                           priv->children[child_index],
+                           priv->children[child_index]->fops->mkdir,
+                           &impunge_local->loc, stbuf->st_mode);
+
+        return 0;
+}
+
+
+/*
+ * Recreate a symlink entry on sink 'child_index', pointing at 'linkname'.
+ * The reply goes to afr_sh_entry_impunge_newfile_cbk.  Always returns 0.
+ */
+int
+afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this,
+                              int child_index, const char *linkname)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *impunge_local = impunge_frame->local;
+
+        gf_log (this->name, GF_LOG_WARNING,
+                "creating symlink %s -> %s on %s",
+                impunge_local->loc.path, linkname,
+                priv->children[child_index]->name);
+
+        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
+                           (void *) (long) child_index,
+                           priv->children[child_index],
+                           priv->children[child_index]->fops->symlink,
+                           linkname, &impunge_local->loc);
+
+        return 0;
+}
+
+
+/*
+ * Callback of the readlink sent to the active source for a symlink that
+ * is missing on a sink.
+ *
+ * The cookie is the index of the sink to recreate the link on.  On
+ * success the symlink is created there with the returned target.  On
+ * failure the per-entry call count is decremented; when it reaches 0 the
+ * per-entry frame is destroyed and the entry is reported as done.
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie,
+                                   xlator_t *this,
+                                   int32_t op_ret, int32_t op_errno,
+                                   const char *linkname)
+{
+        afr_private_t   *priv          = this->private;
+        afr_local_t     *impunge_local = impunge_frame->local;
+        afr_self_heal_t *impunge_sh    = &impunge_local->self_heal;
+        call_frame_t    *sh_frame      = impunge_sh->sh_frame;
+        int              active_src    = impunge_sh->active_source;
+        int              child_index   = (long) cookie;
+        int              remaining     = -1;
+
+        if (op_ret != -1) {
+                afr_sh_entry_impunge_symlink (impunge_frame, this,
+                                              child_index, linkname);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "readlink of %s on %s failed (%s)",
+                impunge_local->loc.path,
+                priv->children[active_src]->name,
+                strerror (op_errno));
+
+        LOCK (&impunge_frame->lock);
+        {
+                remaining = --impunge_local->call_count;
+        }
+        UNLOCK (&impunge_frame->lock);
+
+        if (remaining == 0) {
+                AFR_STACK_DESTROY (impunge_frame);
+                afr_sh_entry_impunge_entry_done (sh_frame, this, active_src);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Read the target of a symlink from the active source (buffer size 4096)
+ * so that it can be recreated on sink 'child_index'; the sink index is
+ * passed to afr_sh_entry_impunge_readlink_cbk through the cookie.
+ * 'stbuf' is not used here.  Always returns 0.
+ */
+int
+afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this,
+                               int child_index, struct stat *stbuf)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *impunge_local = impunge_frame->local;
+        int            active_src    = impunge_local->self_heal.active_source;
+
+        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk,
+                           (void *) (long) child_index,
+                           priv->children[active_src],
+                           priv->children[active_src]->fops->readlink,
+                           &impunge_local->loc, 4096);
+
+        return 0;
+}
+
+
+/*
+ * Callback of the lookup sent to the active source for an entry that is
+ * missing on a sink.
+ *
+ * The cookie is the index of the sink.  On success the stat is saved in
+ * the per-entry local and the entry is recreated on the sink according to
+ * its file type (mknod, readlink + symlink, or mkdir).  If the lookup
+ * failed or the type is unknown, the per-entry call count is decremented;
+ * when it reaches 0 the per-entry frame is destroyed and the entry is
+ * reported as done.  Always returns 0.
+ */
+int
+afr_sh_entry_impunge_recreate_lookup_cbk (call_frame_t *impunge_frame,
+                                          void *cookie, xlator_t *this,
+                                          int32_t op_ret, int32_t op_errno,
+                                          inode_t *inode, struct stat *buf,
+                                          dict_t *xattr)
+{
+        afr_private_t   *priv          = this->private;
+        afr_local_t     *impunge_local = impunge_frame->local;
+        afr_self_heal_t *impunge_sh    = &impunge_local->self_heal;
+        call_frame_t    *sh_frame      = impunge_sh->sh_frame;
+        int              child_index   = (long) cookie;
+        int              active_src    = impunge_sh->active_source;
+        int              type          = 0;
+        int              remaining     = 0;
+
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "looking up %s on %s (for %s) failed (%s)",
+                        impunge_local->loc.path,
+                        priv->children[active_src]->name,
+                        priv->children[child_index]->name,
+                        strerror (op_errno));
+                goto fail;
+        }
+
+        /* keep the source's stat for the later chown and utimens steps */
+        impunge_local->cont.lookup.buf = *buf;
+        type = (buf->st_mode & S_IFMT);
+
+        switch (type) {
+        case S_IFDIR:
+                afr_sh_entry_impunge_mkdir (impunge_frame, this,
+                                            child_index, buf);
+                return 0;
+        case S_IFLNK:
+                afr_sh_entry_impunge_readlink (impunge_frame, this,
+                                               child_index, buf);
+                return 0;
+        case S_IFSOCK:
+        case S_IFREG:
+        case S_IFBLK:
+        case S_IFCHR:
+        case S_IFIFO:
+                afr_sh_entry_impunge_mknod (impunge_frame, this,
+                                            child_index, buf);
+                return 0;
+        default:
+                gf_log (this->name, GF_LOG_ERROR,
+                        "%s has unknown file type on %s: 0%o",
+                        impunge_local->loc.path,
+                        priv->children[active_src]->name, type);
+                break;
+        }
+
+fail:
+        LOCK (&impunge_frame->lock);
+        {
+                remaining = --impunge_local->call_count;
+        }
+        UNLOCK (&impunge_frame->lock);
+
+        if (remaining == 0) {
+                AFR_STACK_DESTROY (impunge_frame);
+                afr_sh_entry_impunge_entry_done (sh_frame, this, active_src);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Begin recreating an entry on sink 'child_index': look the entry up on
+ * the active source first, passing the sink index to
+ * afr_sh_entry_impunge_recreate_lookup_cbk through the cookie.
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_recreate (call_frame_t *impunge_frame, xlator_t *this,
+                               int child_index)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *impunge_local = impunge_frame->local;
+        int            active_src    = impunge_local->self_heal.active_source;
+
+        STACK_WIND_COOKIE (impunge_frame,
+                           afr_sh_entry_impunge_recreate_lookup_cbk,
+                           (void *) (long) child_index,
+                           priv->children[active_src],
+                           priv->children[active_src]->fops->lookup,
+                           &impunge_local->loc, 0);
+
+        return 0;
+}
+
+
+/*
+ * Callback of the lookup sent to one sink for an entry found on the
+ * active source.
+ *
+ * The cookie is the index of the sink.  If the sink reports ENOENT the
+ * entry is recreated there; the call count is then decremented later, in
+ * the recreate callbacks.  In every other case the outcome is logged and
+ * the per-entry call count is decremented here; when it reaches 0 the
+ * per-entry frame is destroyed and the entry is reported as done.
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_entry_cbk (call_frame_t *impunge_frame, void *cookie,
+                                xlator_t *this,
+                                int32_t op_ret, int32_t op_errno,
+                                inode_t *inode, struct stat *buf, dict_t *x)
+{
+        afr_private_t   *priv          = this->private;
+        afr_local_t     *impunge_local = impunge_frame->local;
+        afr_self_heal_t *impunge_sh    = &impunge_local->self_heal;
+        call_frame_t    *sh_frame      = impunge_sh->sh_frame;
+        int              child_index   = (long) cookie;
+        int              active_src    = impunge_sh->active_source;
+        int              remaining     = 0;
+        int              missing       = 0;
+
+        missing = ((op_ret == -1) && (op_errno == ENOENT));
+
+        if (missing) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "missing entry %s on %s",
+                        impunge_local->loc.path,
+                        priv->children[child_index]->name);
+
+                afr_sh_entry_impunge_recreate (impunge_frame, this,
+                                               child_index);
+                return 0;
+        }
+
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "looking up %s under %s failed (%s)",
+                        impunge_local->loc.path,
+                        priv->children[child_index]->name,
+                        strerror (op_errno));
+        } else {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "%s exists under %s",
+                        impunge_local->loc.path,
+                        priv->children[child_index]->name);
+        }
+
+        LOCK (&impunge_frame->lock);
+        {
+                remaining = --impunge_local->call_count;
+        }
+        UNLOCK (&impunge_frame->lock);
+
+        if (remaining == 0) {
+                AFR_STACK_DESTROY (impunge_frame);
+                afr_sh_entry_impunge_entry_done (sh_frame, this, active_src);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Examine one directory entry 'name' read from the active source during
+ * the impunge pass.
+ *
+ * "." and ".." are skipped.  For any other name a copy of the self-heal
+ * frame is made, given its own local with the child's loc, and a lookup is
+ * sent to every child that is up, is not the active source and is not
+ * marked in sh->sources[]; afr_sh_entry_impunge_entry_cbk handles each
+ * reply.  When the entry is skipped or setup fails, the entry is reported
+ * as done immediately.  Always returns 0.
+ */
+int
+afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this,
+ char *name)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ int ret = -1;
+ call_frame_t *impunge_frame = NULL;
+ afr_local_t *impunge_local = NULL;
+ afr_self_heal_t *impunge_sh = NULL;
+ int active_src = 0;
+ int i = 0;
+ int call_count = 0;
+ /* not used directly below; presumably referenced by ALLOC_OR_GOTO -- TODO confirm */
+ int op_errno = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ active_src = sh->active_source;
+
+ /* ret is still -1 here, so the 'out' path reports the entry as done */
+ if ((strcmp (name, ".") == 0)
+ || (strcmp (name, "..") == 0)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "skipping inspection of %s under %s",
+ name, local->loc.path);
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "inspecting existance of %s under %s",
+ name, local->loc.path);
+
+ impunge_frame = copy_frame (frame);
+ if (!impunge_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto out;
+ }
+
+ /*
+ * NOTE(review): if this allocation or build_child_loc below fails,
+ * impunge_frame is not destroyed on the 'out' path -- looks like a
+ * leak; confirm.
+ */
+ ALLOC_OR_GOTO (impunge_local, afr_local_t, out);
+
+ /* the per-entry local records the parent frame and the active source */
+ impunge_frame->local = impunge_local;
+ impunge_sh = &impunge_local->self_heal;
+ impunge_sh->sh_frame = frame;
+ impunge_sh->active_source = active_src;
+
+ ret = build_child_loc (this, &impunge_local->loc, &local->loc, name);
+ if (ret != 0) {
+ goto out;
+ }
+
+ /* count the lookups first so call_count is set before any reply */
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == active_src)
+ continue;
+ if (local->child_up[i] == 0)
+ continue;
+ if (sh->sources[i] == 1)
+ continue;
+ call_count++;
+ }
+
+ /*
+ * NOTE(review): if call_count is 0 here no lookup is sent and nothing
+ * in this function reports the entry as done -- confirm callers
+ * guarantee at least one eligible child.
+ */
+ impunge_local->call_count = call_count;
+
+ /* same filter as the counting loop above */
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == active_src)
+ continue;
+ if (local->child_up[i] == 0)
+ continue;
+ if (sh->sources[i] == 1)
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "looking up %s on %s", impunge_local->loc.path,
+ priv->children[i]->name);
+
+ /* the cookie carries the sink index to the callback */
+ STACK_WIND_COOKIE (impunge_frame,
+ afr_sh_entry_impunge_entry_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->lookup,
+ &impunge_local->loc, 0);
+
+ /* stop once every counted lookup has been sent */
+ if (!--call_count)
+ break;
+ }
+
+ ret = 0;
+out:
+ if (ret == -1)
+ afr_sh_entry_impunge_entry_done (frame, this, active_src);
+
+ return 0;
+}
+
+
+/*
+ * Callback of the readdir sent to the active source during impunge.
+ *
+ * op_ret <= 0 (error or end of directory) ends the work on this source
+ * and moves on through afr_sh_entry_impunge_all.  Otherwise the offset of
+ * the last entry is remembered for the next readdir, the frame's call
+ * count is set to the number of entries, and every entry is handed to
+ * afr_sh_entry_impunge_entry.  Always returns 0.
+ */
+int
+afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie,
+                                  xlator_t *this,
+                                  int32_t op_ret, int32_t op_errno,
+                                  gf_dirent_t *entries)
+{
+        afr_private_t   *priv       = this->private;
+        afr_local_t     *local      = frame->local;
+        afr_self_heal_t *sh         = &local->self_heal;
+        int              active_src = sh->active_source;
+        gf_dirent_t     *entry      = NULL;
+        off_t            resume_at  = 0;
+        int              nentries   = 0;
+
+        if (op_ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "readdir of %s on subvolume %s failed (%s)",
+                        local->loc.path,
+                        priv->children[active_src]->name,
+                        strerror (op_errno));
+        } else if (op_ret == 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "readdir of %s on subvolume %s complete",
+                        local->loc.path,
+                        priv->children[active_src]->name);
+        }
+
+        if (op_ret <= 0) {
+                afr_sh_entry_impunge_all (frame, this);
+                return 0;
+        }
+
+        /* first pass: count the entries and find the resume offset */
+        list_for_each_entry (entry, &entries->list, list) {
+                nentries++;
+                resume_at = entry->d_off;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "readdir'ed %d entries from %s",
+                nentries, priv->children[active_src]->name);
+
+        /* both must be in place before any entry can complete */
+        sh->offset        = resume_at;
+        local->call_count = nentries;
+
+        /* second pass: process each entry */
+        list_for_each_entry (entry, &entries->list, list) {
+                afr_sh_entry_impunge_entry (frame, this, entry->d_name);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Read the next batch of directory entries from subvolume 'active_src',
+ * using the healing fd, block size and offset kept in the self-heal
+ * state.  The reply goes to afr_sh_entry_impunge_readdir_cbk.
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this,
+                             int active_src)
+{
+        afr_private_t   *priv  = this->private;
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+
+        STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk,
+                    priv->children[active_src],
+                    priv->children[active_src]->fops->readdir,
+                    sh->healing_fd, sh->block_size, sh->offset);
+
+        return 0;
+}
+
+
+/*
+ * Drive the impunge pass, one active source at a time.
+ *
+ * The readdir offset is reset and the next active source after
+ * sh->active_source is selected.  If an earlier operation failed the
+ * self-heal is finished; if no source is left the expunge pass is
+ * started; otherwise impunging starts from the selected source.
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t   *priv     = this->private;
+        afr_local_t     *local    = frame->local;
+        afr_self_heal_t *sh       = &local->self_heal;
+        int              next_src = -1;
+
+        sh->offset = 0;
+
+        next_src = next_active_source (frame, this, sh->active_source);
+        sh->active_source = next_src;
+
+        if (sh->op_failed) {
+                afr_sh_entry_finish (frame, this);
+        } else if (next_src == -1) {
+                /* nothing left to impunge from; continue with expunge */
+                afr_sh_entry_expunge_all (frame, this);
+        } else {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "impunging entries of %s on %s to other sinks",
+                        local->loc.path, priv->children[next_src]->name);
+
+                afr_sh_entry_impunge_subvol (frame, this, next_src);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Callback of each opendir sent by afr_sh_entry_open.
+ *
+ * The cookie is the index of the subvolume that replied.  A failed
+ * opendir is logged and marks the self-heal as failed.  When the last
+ * reply has arrived the self-heal is either finished (on failure) or the
+ * impunge pass is started with no active source selected yet.
+ * Always returns 0.
+ *
+ * TODO: some of the opens might fail.  In that case, modify the cleanup
+ * fn to send flush on those fd's which are already open.
+ */
+int
+afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, fd_t *fd)
+{
+        afr_local_t     *local       = frame->local;
+        afr_self_heal_t *sh          = &local->self_heal;
+        afr_private_t   *priv        = this->private;
+        int              child_index = (long) cookie;
+        int              remaining   = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "opendir of %s failed on child %s (%s)",
+                                local->loc.path,
+                                priv->children[child_index]->name,
+                                strerror (op_errno));
+                        sh->op_failed = 1;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        if (sh->op_failed) {
+                afr_sh_entry_finish (frame, this);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "fd for %s opened, commencing sync",
+                local->loc.path);
+
+        sh->active_source = -1;
+        afr_sh_entry_impunge_all (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Open the directory being healed on the source (if there is one) and on
+ * every active sink, all through one shared fd.
+ *
+ * The readdir block size and offset are initialised, the frame's call
+ * count is set to the number of opendirs that will be sent, and each
+ * reply is handled by afr_sh_entry_opendir_cbk.  If the fd cannot be
+ * created the self-heal is marked failed and finished without sending
+ * anything.  Always returns 0.
+ */
+int
+afr_sh_entry_open (call_frame_t *frame, xlator_t *this)
+{
+        int i = 0;
+        int call_count = 0;
+
+        int source = -1;
+        int *sources = NULL;
+
+        fd_t *fd = NULL;
+
+        afr_local_t     *local = NULL;
+        afr_private_t   *priv  = NULL;
+        afr_self_heal_t *sh    = NULL;
+
+        local = frame->local;
+        sh    = &local->self_heal;
+        priv  = this->private;
+
+        source  = local->self_heal.source;
+        sources = local->self_heal.sources;
+
+        sh->block_size = 131072;
+        sh->offset     = 0;
+
+        /* one opendir per active sink, plus one for the source if any */
+        call_count = sh->active_sinks;
+        if (source != -1)
+                call_count++;
+
+        local->call_count = call_count;
+
+        fd = fd_create (local->loc.inode, frame->root->pid);
+        if (!fd) {
+                /*
+                 * Do not wind opendir with a NULL fd.  Assumes fd_create
+                 * reports failure by returning NULL -- TODO confirm.
+                 */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                sh->op_failed = 1;
+                afr_sh_entry_finish (frame, this);
+                return 0;
+        }
+        sh->healing_fd = fd;
+
+        if (source != -1) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "opening directory %s on subvolume %s (source)",
+                        local->loc.path, priv->children[source]->name);
+
+                /* open source */
+                STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
+                                   (void *) (long) source,
+                                   priv->children[source],
+                                   priv->children[source]->fops->opendir,
+                                   &local->loc, fd);
+                call_count--;
+        }
+
+        /* open sinks */
+        for (i = 0; i < priv->child_count; i++) {
+                if (sources[i] || !local->child_up[i])
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "opening directory %s on subvolume %s (sink)",
+                        local->loc.path, priv->children[i]->name);
+
+                STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
+                                   (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->opendir,
+                                   &local->loc, fd);
+
+                /* stop once every counted opendir has been sent */
+                if (!--call_count)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Decide whether an entry self-heal can proceed and, if so, start it.
+ *
+ * Every child that is up and not marked in sh->sources[] is counted as an
+ * active sink and flagged in sh->success[]; the source, if any, is
+ * flagged too.  The heal is finished straight away when there is no
+ * active sink, or when there is no source and fewer than two sinks.
+ * Otherwise the sink count is recorded and the directory is opened.
+ * Always returns 0.
+ */
+int
+afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local  = frame->local;
+        afr_self_heal_t *sh     = &local->self_heal;
+        afr_private_t   *priv   = this->private;
+        int              source = sh->source;
+        int              nsinks = 0;
+        int              idx    = 0;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (sh->sources[idx] != 0)
+                        continue;
+                if (local->child_up[idx] != 1)
+                        continue;
+
+                nsinks++;
+                sh->success[idx] = 1;
+        }
+
+        if (source != -1)
+                sh->success[source] = 1;
+
+        if (nsinks == 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no active sinks for self-heal on dir %s",
+                        local->loc.path);
+                afr_sh_entry_finish (frame, this);
+                return 0;
+        }
+
+        if ((source == -1) && (nsinks < 2)) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "cannot sync with 0 sources and 1 sink on dir %s",
+                        local->loc.path);
+                afr_sh_entry_finish (frame, this);
+                return 0;
+        }
+
+        sh->active_sinks = nsinks;
+
+        if (source == -1) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no active sources for %s found. "
+                        "merging all entries as a conservative decision",
+                        local->loc.path);
+        } else {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "syncing %s from subvolume %s to %d active sinks",
+                        local->loc.path, priv->children[source]->name,
+                        nsinks);
+        }
+
+        afr_sh_entry_open (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Decide the source for entry self-heal.
+ *
+ * Builds the pending matrix from the AFR_ENTRY_PENDING xattrs collected in
+ * sh->xattr, marks sources from it, filters them through
+ * afr_sh_supress_errenous_children() using sh->child_errno, stores the
+ * selected source in sh->source and continues with
+ * afr_sh_entry_sync_prepare().
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_fix (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local     = frame->local;
+        afr_self_heal_t *sh        = &local->self_heal;
+        afr_private_t   *priv      = this->private;
+        int              nchildren = priv->child_count;
+
+        afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr,
+                                     nchildren, AFR_ENTRY_PENDING);
+        afr_sh_print_pending_matrix (sh->pending_matrix, this);
+
+        afr_sh_mark_sources (sh->pending_matrix, sh->sources, nchildren);
+        afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
+                                          nchildren);
+
+        sh->source = afr_sh_select_source (sh->sources, nchildren);
+
+        afr_sh_entry_sync_prepare (frame, this);
+
+        return 0;
+}
+
+
+
+/*
+ * Callback for the per-child lookup wound by afr_sh_entry_lookup().
+ *
+ * cookie carries the child index. When the lookup did not fail
+ * (op_ret != -1) the returned stat buffer is copied into
+ * sh->buf[child_index] and, if a dict was returned, a reference to it is
+ * stored in sh->xattr[child_index]. Once afr_frame_return() reports that
+ * all replies have arrived, afr_sh_entry_fix() is called.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie,
+                         xlator_t *this, int32_t op_ret, int32_t op_errno,
+                         inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+        afr_local_t     *local = NULL;
+        afr_self_heal_t *sh = NULL;
+
+        int call_count  = -1;
+        int child_index = (long) cookie;
+
+        local = frame->local;
+        sh    = &local->self_heal;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        /* a successful reply may carry no dict; do not
+                           take a reference on NULL (same guard as in
+                           afr_sh_metadata_lookup_cbk) */
+                        if (xattr)
+                                sh->xattr[child_index] = dict_ref (xattr);
+                        sh->buf[child_index] = *buf;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                afr_sh_entry_fix (frame, this);
+        }
+
+        return 0;
+}
+
+
+
+/*
+ * Look up local->loc on every child whose child_up flag is set, asking
+ * for the AFR_ENTRY_PENDING xattr (sized for priv->child_count int32
+ * counters). Replies are handled by afr_sh_entry_lookup_cbk().
+ *
+ * local->call_count is primed with local->child_count before winding.
+ * Always returns 0.
+ */
+int
+afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t *priv      = this->private;
+        afr_local_t   *local     = frame->local;
+        dict_t        *request   = NULL;
+        int            remaining = 0;
+        int            child     = 0;
+        int            ret       = 0;
+
+        remaining         = local->child_count;
+        local->call_count = remaining;
+
+        request = dict_new();
+        if (request != NULL)
+                ret = dict_set_uint64 (request, AFR_ENTRY_PENDING,
+                                       priv->child_count * sizeof(int32_t));
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame,
+                                   afr_sh_entry_lookup_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->lookup,
+                                   &local->loc, request);
+
+                /* stop before touching local again once all are wound */
+                if (--remaining == 0)
+                        break;
+        }
+
+        if (request != NULL)
+                dict_unref (request);
+
+        return 0;
+}
+
+
+
+/*
+ * Callback for the entrylk wound by afr_sh_entry_lock().
+ *
+ * cookie carries the child index. A failed lock (op_ret == -1) sets
+ * sh->op_failed and is logged at DEBUG for EAGAIN, ERROR otherwise.
+ * When the last reply arrives, self-heal is finished if any lock failed,
+ * else it continues with afr_sh_entry_lookup().
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t     *local   = frame->local;
+        afr_self_heal_t *sh      = &local->self_heal;
+        int              child   = (long) cookie;
+        int              pending = 0;
+
+        /* TODO: what if lock fails? */
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "inode of %s on child %d locked",
+                                local->loc.path, child);
+                } else {
+                        sh->op_failed = 1;
+
+                        gf_log (this->name,
+                                (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
+                                "locking inode of %s on child %d failed: %s",
+                                local->loc.path, child,
+                                strerror (op_errno));
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        if (sh->op_failed == 1)
+                afr_sh_entry_finish (frame, this);
+        else
+                afr_sh_entry_lookup (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Request a non-blocking write entrylk (ENTRYLK_LOCK_NB, ENTRYLK_WRLCK,
+ * NULL basename) on local->loc from every child whose child_up flag is
+ * set. Replies are handled by afr_sh_entry_lock_cbk().
+ *
+ * local->call_count is primed with local->child_count before winding.
+ * Always returns 0.
+ */
+int
+afr_sh_entry_lock (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = local->child_count;
+        int            child     = 0;
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "locking %s on subvolume %s",
+                        local->loc.path, priv->children[child]->name);
+
+                STACK_WIND_COOKIE (frame, afr_sh_entry_lock_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->entrylk,
+                                   &local->loc, NULL,
+                                   ENTRYLK_LOCK_NB, ENTRYLK_WRLCK);
+
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Entry point of entry self-heal.
+ *
+ * Starts with afr_sh_entry_lock() when both local->need_entry_self_heal
+ * and priv->entry_self_heal are set; otherwise goes straight to
+ * afr_sh_entry_done(). Always returns 0.
+ */
+int
+afr_self_heal_entry (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t *priv  = this->private;
+        afr_local_t   *local = frame->local;
+
+        if (!local->need_entry_self_heal || !priv->entry_self_heal) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "proceeding to completion on %s",
+                        local->loc.path);
+                afr_sh_entry_done (frame, this);
+                return 0;
+        }
+
+        afr_sh_entry_lock (frame, this);
+
+        return 0;
+}
+
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
new file mode 100644
index 000000000..e65a426db
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -0,0 +1,791 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+#include "byte-order.h"
+
+#include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-self-heal-common.h"
+
+
+/*
+ * Final step of metadata self-heal.
+ *
+ * Clears sh->buf and sh->success and drops the xattr references held in
+ * sh->xattr (sh->child_errno is left untouched). Then:
+ *   - if local->govinda_gOvinda is set, self-heal is aborted through
+ *     sh->completion_cbk;
+ *   - regular files continue with afr_self_heal_data();
+ *   - directories continue with afr_self_heal_entry();
+ *   - anything else completes through sh->completion_cbk.
+ * The file type is taken from local->cont.lookup.buf.st_mode.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        afr_private_t   *priv  = this->private;
+        int              child = 0;
+
+        memset (sh->buf, 0, sizeof (struct stat) * priv->child_count);
+        memset (sh->success, 0, sizeof (int) * priv->child_count);
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (sh->xattr[child])
+                        dict_unref (sh->xattr[child]);
+                sh->xattr[child] = NULL;
+        }
+
+        if (local->govinda_gOvinda) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "aborting selfheal of %s",
+                        local->loc.path);
+                sh->completion_cbk (frame, this);
+                return 0;
+        }
+
+        if (S_ISREG (local->cont.lookup.buf.st_mode)) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "proceeding to data check on %s",
+                        local->loc.path);
+                afr_self_heal_data (frame, this);
+                return 0;
+        }
+
+        if (S_ISDIR (local->cont.lookup.buf.st_mode)) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "proceeding to entry check on %s",
+                        local->loc.path);
+                afr_self_heal_entry (frame, this);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "completed self heal of %s",
+                local->loc.path);
+        sh->completion_cbk (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the unlock requests wound by afr_sh_metadata_finish().
+ * The result of the unlock is ignored; once afr_frame_return() reports
+ * that all replies have arrived, afr_sh_metadata_done() is called.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno)
+{
+        int pending = 0;
+
+        /* empty critical section kept from the original */
+        LOCK (&frame->lock);
+        {
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        afr_sh_metadata_done (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Release the metadata self-heal inode lock.
+ *
+ * Sends an F_SETLK/F_UNLCK inodelk covering start 0, length 0 on
+ * local->loc to every child whose child_up flag is set. Replies are
+ * handled by afr_sh_metadata_unlck_cbk().
+ *
+ * local->call_count is primed with local->child_count before winding.
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        struct flock   unlock    = {0, };
+        int            remaining = local->child_count;
+        int            child     = 0;
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                /* the request is re-armed on every iteration */
+                unlock.l_start = 0;
+                unlock.l_len   = 0;
+                unlock.l_type  = F_UNLCK;
+
+                if (!local->child_up[child])
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unlocking %s on subvolume %s",
+                        local->loc.path, priv->children[child]->name);
+
+                STACK_WIND (frame, afr_sh_metadata_unlck_cbk,
+                            priv->children[child],
+                            priv->children[child]->fops->inodelk,
+                            &local->loc, F_SETLK, &unlock);
+
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Callback for the xattrop wound by afr_sh_metadata_erase_pending().
+ * The result is ignored; once afr_frame_return() reports that all
+ * replies have arrived, afr_sh_metadata_finish() is called.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie,
+                                   xlator_t *this, int32_t op_ret,
+                                   int32_t op_errno, dict_t *xattr)
+{
+        int pending = 0;
+
+        /* empty critical section kept from the original */
+        LOCK (&frame->lock);
+        {
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        afr_sh_metadata_finish (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Erase the AFR_METADATA_PENDING counters after a metadata sync.
+ *
+ * Converts sh->pending_matrix into sh->delta_matrix (taking sh->success
+ * into account), creates one dict per child that has an entry in
+ * sh->xattr, fills those dicts through afr_sh_delta_to_xattr() and winds
+ * a GF_XATTROP_ADD_ARRAY xattrop with each of them. Replies are handled
+ * by afr_sh_metadata_erase_pending_cbk().
+ *
+ * If no dict could be prepared, or the dict array cannot be allocated,
+ * nothing is wound and afr_sh_metadata_finish() is called directly.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local = NULL;
+        afr_self_heal_t *sh = NULL;
+        afr_private_t   *priv = NULL;
+        int              call_count = 0;
+        int              i = 0;
+        dict_t         **erase_xattr = NULL;
+
+
+        local = frame->local;
+        sh = &local->self_heal;
+        priv = this->private;
+
+
+        afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix,
+                                 sh->success, priv->child_count);
+
+        erase_xattr = CALLOC (priv->child_count, sizeof (*erase_xattr));
+        if (!erase_xattr) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory while erasing pending flags of %s",
+                        local->loc.path);
+
+                afr_sh_metadata_finish (frame, this);
+                return 0;
+        }
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (!sh->xattr[i])
+                        continue;
+
+                erase_xattr[i] = get_new_dict();
+                if (!erase_xattr[i]) {
+                        /* leave the slot NULL, exactly like a child
+                           that had no xattr; it is simply not wound */
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "out of memory while erasing pending flags "
+                                "of %s on %s",
+                                local->loc.path, priv->children[i]->name);
+                        continue;
+                }
+
+                dict_ref (erase_xattr[i]);
+                call_count++;
+        }
+
+        afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr,
+                               priv->child_count, AFR_METADATA_PENDING);
+
+        local->call_count = call_count;
+
+        if (call_count == 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "metadata of %s not healed on any subvolume",
+                        local->loc.path);
+
+                afr_sh_metadata_finish (frame, this);
+
+                /* nothing to wind; only the local array is left */
+                goto out;
+        }
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (!erase_xattr[i])
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "erasing pending flags from %s on %s",
+                        local->loc.path, priv->children[i]->name);
+
+                STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk,
+                                   (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->xattrop,
+                                   &local->loc,
+                                   GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
+                if (!--call_count)
+                        break;
+        }
+
+out:
+        /* drop our own references; only erase_xattr and priv are used */
+        for (i = 0; i < priv->child_count; i++) {
+                if (erase_xattr[i]) {
+                        dict_unref (erase_xattr[i]);
+                }
+        }
+        FREE (erase_xattr);
+
+        return 0;
+}
+
+
+/*
+ * Common callback for the attribute/xattr sync calls wound by
+ * afr_sh_metadata_sync().
+ *
+ * cookie carries the child index. A failed call (op_ret == -1) is logged
+ * and clears sh->success for that child. When the last reply arrives,
+ * afr_sh_metadata_erase_pending() is called.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t     *local   = frame->local;
+        afr_self_heal_t *sh      = &local->self_heal;
+        afr_private_t   *priv    = this->private;
+        int              child   = (long) cookie;
+        int              pending = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "setting attributes failed for %s on %s (%s)",
+                                local->loc.path,
+                                priv->children[child]->name,
+                                strerror (op_errno));
+
+                        sh->success[child] = 0;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        afr_sh_metadata_erase_pending (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the chown/chmod/utimens calls of afr_sh_metadata_sync().
+ * The returned stat buffer is not used; everything else is forwarded to
+ * afr_sh_metadata_sync_cbk(). Always returns 0.
+ */
+int
+afr_sh_metadata_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        (void) buf;
+
+        afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno);
+        return 0;
+}
+
+
+/*
+ * Callback for the setxattr call of afr_sh_metadata_sync(); forwards
+ * all of its arguments to afr_sh_metadata_sync_cbk(). Always returns 0.
+ */
+int
+afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno)
+{
+        afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno);
+        return 0;
+}
+
+
+/*
+ * Copy ownership, mode, access/modification times and (optionally)
+ * extended attributes from the source child to every sink.
+ *
+ * The source is sh->source and its attributes come from sh->buf[source].
+ * A sink is a child that is not marked in sh->sources and whose child_up
+ * flag is set. When xattr is NULL the setxattr step is skipped.
+ * Replies are handled by afr_sh_metadata_attr_cbk() and
+ * afr_sh_metadata_xattr_cbk().
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr)
+{
+        afr_local_t     *local = NULL;
+        afr_self_heal_t *sh = NULL;
+        afr_private_t   *priv = NULL;
+        int              source = 0;
+        int              active_sinks = 0;
+        int              call_count = 0;
+        int              i = 0;
+        /* zeroed so that tv_nsec is defined even when only whole-second
+           timestamps are available (the #else branch below) */
+        struct timespec  ts[2] = {{0, 0}, {0, 0}};
+
+
+        local = frame->local;
+        sh = &local->self_heal;
+        priv = this->private;
+
+        source = sh->source;
+        active_sinks = sh->active_sinks;
+
+        /*
+         * 4 calls per sink - chown, chmod, utimens, setxattr
+         * (3 when there is no xattr to set)
+         */
+        if (xattr)
+                call_count = active_sinks * 4;
+        else
+                call_count = active_sinks * 3;
+
+        local->call_count = call_count;
+
+#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC
+        ts[0] = sh->buf[source].st_atim;
+        ts[1] = sh->buf[source].st_mtim;
+#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC
+        ts[0] = sh->buf[source].st_atimespec;
+        ts[1] = sh->buf[source].st_mtimespec;
+#else
+        ts[0].tv_sec = sh->buf[source].st_atime;
+        ts[1].tv_sec = sh->buf[source].st_mtime;
+#endif
+
+        for (i = 0; i < priv->child_count; i++) {
+                /* all expected calls are wound; do not touch local */
+                if (call_count == 0) {
+                        break;
+                }
+                if (sh->sources[i] || !local->child_up[i])
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "syncing metadata of %s from %s to %s",
+                        local->loc.path, priv->children[source]->name,
+                        priv->children[i]->name);
+
+                STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk,
+                                   (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->chown,
+                                   &local->loc,
+                                   sh->buf[source].st_uid,
+                                   sh->buf[source].st_gid);
+
+                STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk,
+                                   (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->chmod,
+                                   &local->loc, sh->buf[source].st_mode);
+
+                STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk,
+                                   (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->utimens,
+                                   &local->loc, ts);
+
+                call_count = call_count - 3;
+
+                if (!xattr)
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk,
+                                   (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->setxattr,
+                                   &local->loc, xattr, 0);
+                call_count--;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Callback for the getxattr wound on the source child by
+ * afr_sh_metadata_sync_prepare().
+ *
+ * On success the AFR pending keys (data, metadata, entry) are removed
+ * from the returned dict before it is handed to afr_sh_metadata_sync().
+ * On failure (op_ret == -1) the error is logged and the sync proceeds
+ * without xattrs.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie,
+                              xlator_t *this,
+                              int32_t op_ret, int32_t op_errno, dict_t *xattr)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        afr_private_t   *priv  = this->private;
+        int              src   = sh->source;
+
+        if (op_ret != -1) {
+                dict_del (xattr, AFR_DATA_PENDING);
+                dict_del (xattr, AFR_METADATA_PENDING);
+                dict_del (xattr, AFR_ENTRY_PENDING);
+
+                afr_sh_metadata_sync (frame, this, xattr);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "getxattr of %s failed on subvolume %s (%s). proceeding without xattr",
+                local->loc.path, priv->children[src]->name,
+                strerror (op_errno));
+
+        afr_sh_metadata_sync (frame, this, NULL);
+
+        return 0;
+}
+
+
+/*
+ * Count the sinks for metadata self-heal and fetch the source's xattrs.
+ *
+ * A sink is a child that is not marked in sh->sources and whose child_up
+ * flag is 1. Every sink and the source (sh->source) are marked in
+ * sh->success. With no sink the self-heal is finished; otherwise the
+ * sink count is stored in sh->active_sinks and a getxattr is wound on
+ * the source, answered by afr_sh_metadata_getxattr_cbk().
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local  = frame->local;
+        afr_self_heal_t *sh     = &local->self_heal;
+        afr_private_t   *priv   = this->private;
+        int              src    = sh->source;
+        int              nsinks = 0;
+        int              idx    = 0;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (sh->sources[idx] != 0 || local->child_up[idx] != 1)
+                        continue;
+
+                nsinks++;
+                sh->success[idx] = 1;
+        }
+        sh->success[src] = 1;
+
+        if (nsinks == 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no active sinks for performing self-heal on file %s",
+                        local->loc.path);
+                afr_sh_metadata_finish (frame, this);
+                return 0;
+        }
+
+        sh->active_sinks = nsinks;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "syncing metadata of %s from subvolume %s to %d active sinks",
+                local->loc.path, priv->children[src]->name, nsinks);
+
+        STACK_WIND (frame, afr_sh_metadata_getxattr_cbk,
+                    priv->children[src],
+                    priv->children[src]->fops->getxattr,
+                    &local->loc, NULL);
+
+        return 0;
+}
+
+
+/*
+ * Decide the source for metadata self-heal.
+ *
+ * Builds the pending matrix from the AFR_METADATA_PENDING xattrs in
+ * sh->xattr, marks sources and filters them through
+ * afr_sh_supress_errenous_children(). If no source is left and
+ * priv->favorite_child is configured and has no recorded errno, that
+ * child is used as the source. If there is still no source, the
+ * conflict is logged, local->govinda_gOvinda is set and the self-heal
+ * is finished.
+ *
+ * Otherwise the selected source is stored in sh->source, every other
+ * error-free child whose mode or ownership differs from the source's is
+ * demoted to a sink, and afr_sh_metadata_sync_prepare() is called.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local    = frame->local;
+        afr_self_heal_t *sh       = &local->self_heal;
+        afr_private_t   *priv     = this->private;
+        int              nsources = 0;
+        int              src      = 0;
+        int              idx      = 0;
+
+        afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr,
+                                     priv->child_count, AFR_METADATA_PENDING);
+        afr_sh_print_pending_matrix (sh->pending_matrix, this);
+
+        afr_sh_mark_sources (sh->pending_matrix, sh->sources,
+                             priv->child_count);
+        afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
+                                          priv->child_count);
+
+        nsources = afr_sh_source_count (sh->sources, priv->child_count);
+
+        if ((nsources == 0)
+            && (priv->favorite_child != -1)
+            && (sh->child_errno[priv->favorite_child] == 0)) {
+
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Picking favorite child %s as authentic source to resolve conflicting metadata of %s",
+                        priv->children[priv->favorite_child]->name,
+                        local->loc.path);
+
+                sh->sources[priv->favorite_child] = 1;
+
+                nsources = afr_sh_source_count (sh->sources,
+                                                priv->child_count);
+        }
+
+        if (nsources == 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to resolve conflicting metadata of %s. "
+                        "Please resolve manually by fixing the "
+                        "permissions/ownership of %s on your subvolumes. "
+                        "You can also consider 'option favorite-child <>'",
+                        local->loc.path, local->loc.path);
+
+                local->govinda_gOvinda = 1;
+
+                afr_sh_metadata_finish (frame, this);
+                return 0;
+        }
+
+        src = afr_sh_select_source (sh->sources, priv->child_count);
+        sh->source = src;
+
+        /* detect changes not visible through pending flags -- JIC */
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (idx == src || sh->child_errno[idx])
+                        continue;
+
+                if (PERMISSION_DIFFERS (&sh->buf[idx], &sh->buf[src])
+                    || OWNERSHIP_DIFFERS (&sh->buf[idx], &sh->buf[src]))
+                        sh->sources[idx] = 0;
+        }
+
+        afr_sh_metadata_sync_prepare (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the per-child lookup wound by afr_sh_metadata_lookup().
+ *
+ * cookie carries the child index. On success (op_ret == 0) the stat
+ * buffer is copied into sh->buf[child] and, if a dict was returned, a
+ * reference to it is kept in sh->xattr[child]. Otherwise op_errno is
+ * recorded in sh->child_errno[child]. When the last reply arrives,
+ * afr_sh_metadata_fix() is called.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                            int32_t op_ret, int32_t op_errno,
+                            inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+        afr_local_t     *local   = frame->local;
+        afr_self_heal_t *sh      = &local->self_heal;
+        afr_private_t   *priv    = this->private;
+        int              child   = (long) cookie;
+        int              pending = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != 0) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "path %s on subvolume %s => -1 (%s)",
+                                local->loc.path,
+                                priv->children[child]->name,
+                                strerror (op_errno));
+
+                        sh->child_errno[child] = op_errno;
+                } else {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "path %s on subvolume %s is of mode 0%o",
+                                local->loc.path,
+                                priv->children[child]->name,
+                                buf->st_mode);
+
+                        sh->buf[child] = *buf;
+                        if (xattr)
+                                sh->xattr[child] = dict_ref (xattr);
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        afr_sh_metadata_fix (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Look up local->loc on every child whose child_up flag is set, asking
+ * for the AFR_METADATA_PENDING xattr (sized for priv->child_count int32
+ * counters). Replies are handled by afr_sh_metadata_lookup_cbk().
+ *
+ * local->call_count is primed with local->child_count before winding.
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        dict_t        *request   = NULL;
+        int            remaining = 0;
+        int            child     = 0;
+        int            ret       = 0;
+
+        remaining         = local->child_count;
+        local->call_count = remaining;
+
+        request = dict_new();
+        if (request != NULL)
+                ret = dict_set_uint64 (request, AFR_METADATA_PENDING,
+                                       priv->child_count * sizeof(int32_t));
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "looking up %s on %s",
+                        local->loc.path, priv->children[child]->name);
+
+                STACK_WIND_COOKIE (frame, afr_sh_metadata_lookup_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->lookup,
+                                   &local->loc, request);
+
+                if (--remaining == 0)
+                        break;
+        }
+
+        if (request != NULL)
+                dict_unref (request);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the inodelk wound by afr_sh_metadata_lock().
+ *
+ * cookie carries the child index. A failed lock (op_ret == -1) sets
+ * sh->op_failed and is logged at DEBUG for EAGAIN, ERROR otherwise.
+ * When the last reply arrives, self-heal is finished if any lock failed,
+ * else it continues with afr_sh_metadata_lookup().
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t     *local   = frame->local;
+        afr_self_heal_t *sh      = &local->self_heal;
+        int              child   = (long) cookie;
+        int              pending = 0;
+
+        /* TODO: what if lock fails? */
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "inode of %s on child %d locked",
+                                local->loc.path, child);
+                } else {
+                        sh->op_failed = 1;
+
+                        gf_log (this->name,
+                                (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
+                                "locking of %s on child %d failed: %s",
+                                local->loc.path, child,
+                                strerror (op_errno));
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        if (sh->op_failed)
+                afr_sh_metadata_finish (frame, this);
+        else
+                afr_sh_metadata_lookup (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Take the metadata self-heal inode lock.
+ *
+ * Sends an F_SETLK/F_WRLCK inodelk covering start 0, length 0 on
+ * local->loc to every child whose child_up flag is set. Replies are
+ * handled by afr_sh_metadata_lk_cbk().
+ *
+ * local->call_count is primed with local->child_count before winding.
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        struct flock   wrlock    = {0, };
+        int            remaining = local->child_count;
+        int            child     = 0;
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                /* the request is re-armed on every iteration */
+                wrlock.l_start = 0;
+                wrlock.l_len   = 0;
+                wrlock.l_type  = F_WRLCK;
+
+                if (!local->child_up[child])
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "locking %s on subvolume %s",
+                        local->loc.path, priv->children[child]->name);
+
+                STACK_WIND_COOKIE (frame, afr_sh_metadata_lk_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->inodelk,
+                                   &local->loc, F_SETLK, &wrlock);
+
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Entry point of metadata self-heal.
+ *
+ * Starts with afr_sh_metadata_lock() when both
+ * local->need_metadata_self_heal and priv->metadata_self_heal are set;
+ * otherwise goes straight to afr_sh_metadata_done(). Always returns 0.
+ */
+int
+afr_self_heal_metadata (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t *priv  = this->private;
+        afr_local_t   *local = frame->local;
+
+        if (!local->need_metadata_self_heal || !priv->metadata_self_heal) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "proceeding to data check on %s",
+                        local->loc.path);
+                afr_sh_metadata_done (frame, this);
+                return 0;
+        }
+
+        afr_sh_metadata_lock (frame, this);
+
+        return 0;
+}
+
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
new file mode 100644
index 000000000..1c97a9bc1
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -0,0 +1,52 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __AFR_SELF_HEAL_H__
+#define __AFR_SELF_HEAL_H__
+
+#include <sys/stat.h>
+
+/*
+ * Comparison helpers for two stat buffers. Each argument is cast to
+ * (struct stat *) and is parenthesized so that any pointer expression
+ * can be passed. Each macro evaluates to non-zero when the compared
+ * field(s) differ:
+ *   FILETYPE_DIFFERS   - the S_IFMT bits of st_mode
+ *   PERMISSION_DIFFERS - the whole st_mode value
+ *   OWNERSHIP_DIFFERS  - st_uid or st_gid
+ *   SIZE_DIFFERS       - st_size
+ */
+#define FILETYPE_DIFFERS(buf1,buf2) ((S_IFMT & ((struct stat *)(buf1))->st_mode) != (S_IFMT & ((struct stat *)(buf2))->st_mode))
+#define PERMISSION_DIFFERS(buf1,buf2) ((((struct stat *)(buf1))->st_mode) != (((struct stat *)(buf2))->st_mode))
+#define OWNERSHIP_DIFFERS(buf1,buf2) ((((struct stat *)(buf1))->st_uid) != (((struct stat *)(buf2))->st_uid) || (((struct stat *)(buf1))->st_gid != (((struct stat *)(buf2))->st_gid)))
+#define SIZE_DIFFERS(buf1,buf2) ((((struct stat *)(buf1))->st_size) != (((struct stat *)(buf2))->st_size))
+
+
+
+/* pending-flag checks on an xattr dict, one per self-heal kind */
+int
+afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this);
+int
+afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this);
+int
+afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this);
+
+/* entry points of the individual self-heal phases */
+int
+afr_self_heal_entry (call_frame_t *frame, xlator_t *this);
+
+int
+afr_self_heal_data (call_frame_t *frame, xlator_t *this);
+
+int
+afr_self_heal_metadata (call_frame_t *frame, xlator_t *this);
+
+/* top-level self-heal; takes the callback to invoke on completion */
+int
+afr_self_heal (call_frame_t *frame, xlator_t *this,
+               int (*completion_cbk) (call_frame_t *, xlator_t *));
+
+#endif /* __AFR_SELF_HEAL_H__ */
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
new file mode 100644
index 000000000..3df9f07e5
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -0,0 +1,957 @@
+/*
+ Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include "dict.h"
+#include "byte-order.h"
+
+#include "afr.h"
+#include "afr-transaction.h"
+
+#include <signal.h>
+
+
+/*
+ * Set each of the first child_count slots of pending to hton32 (1).
+ */
+static void
+__mark_all_pending (int32_t *pending, int child_count)
+{
+        int idx = 0;
+
+        while (idx < child_count) {
+                pending[idx] = hton32 (1);
+                idx++;
+        }
+}
+
+
+/*
+ * Zero the pending slot of one child. child_count is accepted but not
+ * used; child is not range-checked.
+ */
+static void
+__mark_child_dead (int32_t *pending, int child_count, int child)
+{
+        int32_t *slot = &pending[child];
+
+        *slot = 0;
+}
+
+
+/*
+ * Zero the pending slot of every child whose child_up flag is clear;
+ * slots of children that are up are left as they are.
+ */
+static void
+__mark_down_children (int32_t *pending, int child_count, unsigned char *child_up)
+{
+        int idx = 0;
+
+        for (idx = 0; idx < child_count; idx++) {
+                if (child_up[idx])
+                        continue;
+
+                pending[idx] = 0;
+        }
+}
+
+
+/*
+ * Set each of the first child_count slots of pending to hton32 (-1).
+ */
+static void
+__mark_all_success (int32_t *pending, int child_count)
+{
+        int idx = 0;
+
+        while (idx < child_count) {
+                pending[idx] = hton32 (-1);
+                idx++;
+        }
+}
+
+
+/*
+ * Report whether this is the first write seen on fd.
+ *
+ * If fd_ctx_get() finds no context for this xlator on fd (negative
+ * return), a marker context (0xaf1) is stored with fd_ctx_set() and 1
+ * is returned. Otherwise 0 is returned. The result of fd_ctx_set() is
+ * not checked.
+ */
+static int
+__is_first_write_on_fd (xlator_t *this, fd_t *fd)
+{
+        int ctx_ret = -1;
+
+        ctx_ret = fd_ctx_get (fd, this, NULL);
+        if (ctx_ret >= 0)
+                return 0;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "first writev() on fd=%p, writing changelog",
+                fd);
+
+        ctx_ret = fd_ctx_set (fd, this, 0xaf1);
+
+        return 1;
+}
+
+
+/*
+ * Return 1 if the changelog is enabled for the given transaction type,
+ * 0 otherwise. Data, metadata and entry (including rename) transactions
+ * follow the matching priv->*_change_log option; flush transactions
+ * always return 1.
+ */
+static int
+__changelog_enabled (afr_private_t *priv, afr_transaction_type type)
+{
+        switch (type) {
+        case AFR_DATA_TRANSACTION:
+                return priv->data_change_log ? 1 : 0;
+
+        case AFR_METADATA_TRANSACTION:
+                return priv->metadata_change_log ? 1 : 0;
+
+        case AFR_ENTRY_TRANSACTION:
+        case AFR_ENTRY_RENAME_TRANSACTION:
+                return priv->entry_change_log ? 1 : 0;
+
+        case AFR_FLUSH_TRANSACTION:
+                return 1;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Return 1 if the changelog must be written before the operation,
+ * 0 otherwise.
+ *
+ * Nothing is needed when the changelog is disabled for the transaction
+ * type. For write and ftruncate it is needed only when there is no fd
+ * or on the first write on that fd; for flush it is never needed (only
+ * the post-op is done); for every other operation it is needed.
+ */
+static int
+__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t *priv   = this->private;
+        afr_local_t   *local  = frame->local;
+        fd_t          *fd     = NULL;
+        int            needed = 0;
+
+        if (!__changelog_enabled (priv, local->transaction.type))
+                return 0;
+
+        switch (local->op) {
+        case GF_FOP_WRITE:
+        case GF_FOP_FTRUNCATE:
+                /*
+                  if it's a data transaction, we write the changelog
+                  only on the first write on an fd
+                */
+                fd = local->fd;
+                if (!fd || __is_first_write_on_fd (this, fd))
+                        needed = 1;
+                break;
+
+        case GF_FOP_FLUSH:
+                /* only do post-op on flush() */
+                needed = 0;
+                break;
+
+        default:
+                needed = 1;
+        }
+
+        return needed;
+}
+
+
+/*
+ * Return 1 if the changelog must be written after the operation:
+ * the changelog is enabled for the transaction type and the operation
+ * is neither a write nor an ftruncate. Returns 0 otherwise.
+ */
+static int
+__changelog_needed_post_op (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t *priv  = this->private;
+        afr_local_t   *local = frame->local;
+
+        if (!__changelog_enabled (priv, local->transaction.type))
+                return 0;
+
+        if (local->op == GF_FOP_WRITE || local->op == GF_FOP_FTRUNCATE)
+                return 0;
+
+        return 1;
+}
+
+
+/*
+ * Return the configured lock server count for a transaction type:
+ * priv->data_lock_server_count for data and flush transactions,
+ * priv->metadata_lock_server_count for metadata transactions and
+ * priv->entry_lock_server_count for entry and rename transactions.
+ * Any other value yields 0.
+ */
+static int
+afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)
+{
+        switch (type) {
+        case AFR_FLUSH_TRANSACTION:
+        case AFR_DATA_TRANSACTION:
+                return priv->data_lock_server_count;
+
+        case AFR_METADATA_TRANSACTION:
+                return priv->metadata_lock_server_count;
+
+        case AFR_ENTRY_TRANSACTION:
+        case AFR_ENTRY_RENAME_TRANSACTION:
+                return priv->entry_lock_server_count;
+        }
+
+        return 0;
+}
+
+
+/* {{{ unlock */
+
+/*
+ * Callback for every unlock request wound by afr_unlock(). The result
+ * of the unlock is ignored; local->call_count is decremented under
+ * frame->lock and, when it reaches zero, local->transaction.done is
+ * invoked. Always returns 0.
+ */
+int32_t
+afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local     = frame->local;
+        int          remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+                remaining = --local->call_count;
+        }
+        UNLOCK (&frame->lock);
+
+        if (remaining != 0)
+                return 0;
+
+        local->transaction.done (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Release the transaction locks on every child marked in
+ * local->transaction.locked_nodes.
+ *
+ * Data, metadata and flush transactions unlock the range
+ * [transaction.start, transaction.len] with finodelk (when local->fd is
+ * set) or inodelk on local->loc. Entry transactions unlock
+ * transaction.basename with fentrylk/entrylk; rename transactions
+ * additionally unlock transaction.new_basename under new_parent_loc, so
+ * they expect two replies per locked child.
+ *
+ * If no child is locked, local->transaction.done is called directly.
+ * Replies are handled by afr_unlock_common_cbk(). Always returns 0.
+ */
+int
+afr_unlock (call_frame_t *frame, xlator_t *this)
+{
+        /* zeroed so that the fields not set below (e.g. l_whence) are
+           not sent with indeterminate values */
+        struct flock flock = {0, };
+
+        int i = 0;
+        int call_count = 0;
+
+        afr_local_t   *local = NULL;
+        afr_private_t *priv  = this->private;
+
+        local = frame->local;
+
+        call_count = afr_locked_nodes_count (local->transaction.locked_nodes,
+                                             priv->child_count);
+
+        if (call_count == 0) {
+                local->transaction.done (frame, this);
+                return 0;
+        }
+
+        if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)
+                call_count *= 2;
+
+        local->call_count = call_count;
+
+        for (i = 0; i < priv->child_count; i++) {
+                flock.l_start = local->transaction.start;
+                flock.l_len   = local->transaction.len;
+                flock.l_type  = F_UNLCK;
+
+                if (local->transaction.locked_nodes[i]) {
+                        switch (local->transaction.type) {
+                        case AFR_DATA_TRANSACTION:
+                        case AFR_METADATA_TRANSACTION:
+                        case AFR_FLUSH_TRANSACTION:
+
+                                if (local->fd) {
+                                        STACK_WIND (frame, afr_unlock_common_cbk,
+                                                    priv->children[i],
+                                                    priv->children[i]->fops->finodelk,
+                                                    local->fd, F_SETLK, &flock);
+                                } else {
+                                        STACK_WIND (frame, afr_unlock_common_cbk,
+                                                    priv->children[i],
+                                                    priv->children[i]->fops->inodelk,
+                                                    &local->loc, F_SETLK, &flock);
+                                }
+
+                                break;
+
+                        case AFR_ENTRY_RENAME_TRANSACTION:
+
+                                STACK_WIND (frame, afr_unlock_common_cbk,
+                                            priv->children[i],
+                                            priv->children[i]->fops->entrylk,
+                                            &local->transaction.new_parent_loc,
+                                            local->transaction.new_basename,
+                                            ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+
+                                call_count--;
+
+                                /* fall through */
+
+                        case AFR_ENTRY_TRANSACTION:
+                                if (local->fd) {
+                                        STACK_WIND (frame, afr_unlock_common_cbk,
+                                                    priv->children[i],
+                                                    priv->children[i]->fops->fentrylk,
+                                                    local->fd,
+                                                    local->transaction.basename,
+                                                    ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+                                } else {
+                                        STACK_WIND (frame, afr_unlock_common_cbk,
+                                                    priv->children[i],
+                                                    priv->children[i]->fops->entrylk,
+                                                    &local->transaction.parent_loc,
+                                                    local->transaction.basename,
+                                                    ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+
+                                }
+                                break;
+                        }
+
+                        /* stop before touching local again once every
+                           expected request has been wound */
+                        if (!--call_count)
+                                break;
+                }
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+
+/* {{{ pending */
+
+/*
+ * Callback for the post-op changelog xattrop requests. The result is
+ * ignored; local->call_count is decremented under frame->lock and, when
+ * it reaches zero, the transaction either completes directly through
+ * local->transaction.done (lock server count of 0 for this transaction
+ * type) or continues with afr_unlock(). Always returns 0.
+ */
+int32_t
+afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, dict_t *xattr)
+{
+        afr_private_t *priv      = this->private;
+        afr_local_t   *local     = frame->local;
+        int            remaining = -1;
+
+        LOCK (&frame->lock);
+        {
+                remaining = --local->call_count;
+        }
+        UNLOCK (&frame->lock);
+
+        if (remaining != 0)
+                return 0;
+
+        if (afr_lock_server_count (priv, local->transaction.type) == 0)
+                local->transaction.done (frame, this);
+        else
+                afr_unlock (frame, this);
+
+        return 0;
+}
+
+/**
+ * afr_changelog_post_op - wind the post-operation changelog xattrop
+ *
+ * Rebuilds local->pending_array with __mark_all_success () followed by
+ * __mark_down_children (), then sends it under the key
+ * local->transaction.pending as a GF_XATTROP_ADD_ARRAY xattrop/fxattrop to
+ * every child flagged in local->child_up. A rename transaction winds two
+ * xattrops per child (new_parent_loc first, then the regular entry target
+ * through the fall-through), so the expected reply count is doubled.
+ * Replies are collected by afr_changelog_post_op_cbk (). When no child is
+ * up the function goes straight to afr_unlock ().
+ *
+ * Always returns 0.
+ */
+int
+afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t * priv = this->private;
+
+ int ret = 0;
+ int i = 0;
+ int call_count = 0;
+
+ afr_local_t * local = NULL;
+ dict_t * xattr = dict_ref (get_new_dict ()); /* NOTE(review): the result of get_new_dict () is not checked */
+
+ local = frame->local;
+
+ __mark_all_success (local->pending_array, priv->child_count);
+ __mark_down_children (local->pending_array, priv->child_count, local->child_up);
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ call_count *= 2; /* two xattrops are wound per child for a rename */
+ }
+
+ local->call_count = call_count;
+
+ if (call_count == 0) {
+ /* no child is up */
+ dict_unref (xattr);
+ afr_unlock (frame, this);
+ return 0;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ ret = dict_set_static_bin (xattr, local->transaction.pending,
+ local->pending_array,
+ priv->child_count * sizeof (int32_t));
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set pending entry");
+
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ case AFR_FLUSH_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ else
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->loc,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ }
+ break;
+
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ {
+ STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr);
+
+ call_count--; /* accounts for the extra wind just issued */
+ }
+
+ /*
+ set it again because previous stack_wind
+ might have already returned (think of case
+ where subvolume is posix) and would have
+ used the dict as placeholder for return
+ value
+ */
+ ret = dict_set_static_bin (xattr, local->transaction.pending,
+ local->pending_array,
+ priv->child_count * sizeof (int32_t));
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set pending entry");
+
+ /* fall through */
+
+ case AFR_ENTRY_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ else
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ }
+ break;
+ }
+
+ if (!--call_count) /* every expected call has been wound */
+ break;
+ }
+ }
+
+ dict_unref (xattr);
+ return 0;
+}
+
+
+/**
+ * afr_changelog_pre_op_cbk - reply handler for the pre-op xattrop calls
+ *
+ * @cookie carries the index of the child that replied. A failed reply
+ * clears that child in local->child_up and records op_errno; ENOTSUP
+ * additionally sets local->op_ret to -1. Once the last reply is in, the
+ * transaction is resumed (without running the fop) when it ended with
+ * op_ret == -1 and op_errno == ENOTSUP, and transaction.fop is called
+ * otherwise.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, dict_t *xattr)
+{
+        afr_local_t *   local = NULL;
+        afr_private_t * priv  = this->private;
+
+        int call_count  = -1;
+        int child_index = (long) cookie;
+
+        local = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == -1) {
+                        /* no further calls are sent to this child */
+                        local->child_up[child_index] = 0;
+
+                        if (op_errno == ENOTSUP) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "xattrop not supported by %s",
+                                        priv->children[child_index]->name);
+                                local->op_ret = -1;
+                        } else if (!child_went_down (op_ret, op_errno)) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "xattrop failed on child %s: %s",
+                                        priv->children[child_index]->name,
+                                        strerror (op_errno));
+                        }
+                        local->op_errno = op_errno;
+                }
+
+                call_count = --local->call_count;
+        }
+        UNLOCK (&frame->lock);
+
+        if (call_count == 0) {
+                if ((local->op_ret == -1) &&
+                    (local->op_errno == ENOTSUP)) {
+                        local->transaction.resume (frame, this);
+                } else {
+                        local->transaction.fop (frame, this);
+                }
+        }
+
+        return 0;
+}
+
+/**
+ * afr_changelog_pre_op - wind the pre-operation changelog xattrop
+ *
+ * Fills local->pending_array through __mark_all_pending () and sends it
+ * under the key local->transaction.pending as a GF_XATTROP_ADD_ARRAY
+ * xattrop/fxattrop to every child flagged in local->child_up. Each wind
+ * carries the child index as cookie for afr_changelog_pre_op_cbk (). A
+ * rename transaction winds two xattrops per child (new_parent_loc first,
+ * then the regular entry target through the fall-through), so the expected
+ * reply count is doubled. When no child is up the function calls
+ * afr_unlock () instead.
+ *
+ * Always returns 0.
+ */
+int
+afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t * priv = this->private;
+
+ int i = 0;
+ int ret = 0;
+ int call_count = 0;
+ dict_t *xattr = NULL;
+
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+ xattr = get_new_dict (); /* NOTE(review): result is not checked before dict_ref () */
+ dict_ref (xattr);
+
+ call_count = afr_up_children_count (priv->child_count,
+ local->child_up);
+
+ if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ call_count *= 2; /* two xattrops are wound per child for a rename */
+ }
+
+ if (call_count == 0) {
+ /* no child is up */
+ dict_unref (xattr);
+ afr_unlock (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ __mark_all_pending (local->pending_array, priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ ret = dict_set_static_bin (xattr,
+ local->transaction.pending,
+ local->pending_array,
+ (priv->child_count *
+ sizeof (int32_t)));
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set pending entry");
+
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ case AFR_FLUSH_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ else
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &(local->loc),
+ GF_XATTROP_ADD_ARRAY, xattr);
+ }
+ break;
+
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ {
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr);
+
+ call_count--; /* accounts for the extra wind just issued */
+ }
+
+
+ /*
+ set it again because previous stack_wind
+ might have already returned (think of case
+ where subvolume is posix) and would have
+ used the dict as placeholder for return
+ value
+ */
+
+ ret = dict_set_static_bin (xattr, local->transaction.pending,
+ local->pending_array,
+ priv->child_count * sizeof (int32_t));
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set pending entry");
+
+ /* fall through */
+
+ case AFR_ENTRY_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ else
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ }
+
+ break;
+ }
+
+ if (!--call_count) /* every expected call has been wound */
+ break;
+ }
+ }
+
+ dict_unref (xattr);
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ lock */
+
+static
+int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index);
+/**
+ * afr_lock_cbk - reply handler for a lock request sent by afr_lock_rec ()
+ *
+ * @cookie carries the index of the child the lock was sent to. A rename
+ * transaction sends two entry locks per child, so for that type the
+ * handler waits (through local->call_count) until both replies are in;
+ * for every other type call_count stays 0 and the first reply proceeds.
+ * A failed reply clears the child in local->child_up and records op_errno;
+ * ENOSYS additionally copies op_ret into local->op_ret, which makes the
+ * transaction go to afr_unlock (). In all other cases the child is
+ * recorded in transaction.locked_nodes and locking continues with the
+ * next child.
+ *
+ * NOTE(review): a reply that failed with an errno other than ENOSYS still
+ * ends up in locked_nodes and is counted in lock_count - confirm that
+ * this is intended.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ int done = 0; /* NOTE(review): assigned below but never read */
+ int child_index = (long) cookie;
+
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ /* wait for the other lock to return */
+ call_count = --local->call_count;
+ }
+
+ if (op_ret == -1) {
+ if (op_errno == ENOSYS) {
+ /* return ENOTSUP */
+ gf_log (this->name, GF_LOG_ERROR,
+ "subvolume does not support locking. "
+ "please load features/posix-locks xlator on server");
+ local->op_ret = op_ret;
+ done = 1;
+ }
+
+ local->child_up[child_index] = 0;
+ local->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if (call_count == 0) {
+ if ((local->op_ret == -1) &&
+ (local->op_errno == ENOSYS)) {
+ afr_unlock (frame, this);
+ } else {
+ local->transaction.locked_nodes[child_index] = 1;
+ local->transaction.lock_count++;
+ afr_lock_rec (frame, this, child_index + 1);
+ }
+ }
+
+ return 0;
+}
+
+
+/**
+ * lower_path - pick which of two (loc, basename) pairs sorts first
+ *
+ * The pairs are ordered by strcmp () on the loc paths and, when the paths
+ * are equal, by strcmp () on the basenames. Returns @l1 when the first
+ * pair sorts before or equal to the second one, @l2 otherwise.
+ */
+static loc_t *
+lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2)
+{
+        int order = strcmp (l1->path, l2->path);
+
+        /* the paths alone decide whenever they differ */
+        if (order != 0)
+                return (order < 0) ? l1 : l2;
+
+        return (strcmp (b1, b2) <= 0) ? l1 : l2;
+}
+
+
+/**
+ * afr_lock_rec - lock the next usable child, or move on once locking is over
+ *
+ * Starting at @child_index, skips the children that are not flagged in
+ * local->child_up and then:
+ *  - fails the transaction with EAGAIN (through transaction.done) when the
+ *    end of the child list is reached without a single lock being held;
+ *  - starts the pre-op changelog, or the fop itself, when either the end
+ *    of the list is reached or lock_count equals afr_lock_server_count ();
+ *  - otherwise winds the lock that matches the transaction type to the
+ *    child found; afr_lock_cbk () then continues with child_index + 1.
+ *
+ * A rename transaction takes two entry locks per child. They are always
+ * requested in the order given by lower_path ().
+ *
+ * Always returns 0.
+ */
+static
+int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index)
+{
+        afr_local_t *   local = NULL;
+        afr_private_t * priv  = NULL;
+
+        /* zero-initialised so that the members which are not set below
+           (l_whence, l_pid) are not sent to the children uninitialised */
+        struct flock flock = {0, };
+
+        loc_t * lower  = NULL;
+        loc_t * higher = NULL;
+
+        const char *lower_name  = NULL;
+        const char *higher_name = NULL;
+
+        local = frame->local;
+        priv  = this->private;
+
+        flock.l_start = local->transaction.start;
+        flock.l_len   = local->transaction.len;
+        flock.l_type  = F_WRLCK;
+
+        /* skip over children that are down */
+        while ((child_index < priv->child_count)
+               && !local->child_up[child_index])
+                child_index++;
+
+        if ((child_index == priv->child_count) &&
+            local->transaction.lock_count == 0) {
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unable to lock on even one child");
+
+                local->op_ret   = -1;
+                local->op_errno = EAGAIN;
+
+                local->transaction.done (frame, this);
+
+                return 0;
+
+        }
+
+        if ((child_index == priv->child_count)
+            || (local->transaction.lock_count ==
+                afr_lock_server_count (priv, local->transaction.type))) {
+
+                /* we're done locking */
+
+                if (__changelog_needed_pre_op (frame, this)) {
+                        afr_changelog_pre_op (frame, this);
+                } else {
+                        local->transaction.fop (frame, this);
+                }
+
+                return 0;
+        }
+
+        switch (local->transaction.type) {
+        case AFR_DATA_TRANSACTION:
+        case AFR_METADATA_TRANSACTION:
+        case AFR_FLUSH_TRANSACTION:
+
+                if (local->fd) {
+                        STACK_WIND_COOKIE (frame, afr_lock_cbk,
+                                           (void *) (long) child_index,
+                                           priv->children[child_index],
+                                           priv->children[child_index]->fops->finodelk,
+                                           local->fd, F_SETLKW, &flock);
+
+                } else {
+                        STACK_WIND_COOKIE (frame, afr_lock_cbk,
+                                           (void *) (long) child_index,
+                                           priv->children[child_index],
+                                           priv->children[child_index]->fops->inodelk,
+                                           &local->loc, F_SETLKW, &flock);
+                }
+
+                break;
+
+        case AFR_ENTRY_RENAME_TRANSACTION:
+        {
+                /* afr_lock_cbk () waits for both replies */
+                local->call_count = 2;
+
+                lower = lower_path (&local->transaction.parent_loc,
+                                    local->transaction.basename,
+                                    &local->transaction.new_parent_loc,
+                                    local->transaction.new_basename);
+
+                lower_name = (lower == &local->transaction.parent_loc ?
+                              local->transaction.basename :
+                              local->transaction.new_basename);
+
+                higher = (lower == &local->transaction.parent_loc ?
+                          &local->transaction.new_parent_loc :
+                          &local->transaction.parent_loc);
+
+                higher_name = (higher == &local->transaction.parent_loc ?
+                               local->transaction.basename :
+                               local->transaction.new_basename);
+
+
+                /* TODO: these locks should be blocking */
+
+                STACK_WIND_COOKIE (frame, afr_lock_cbk,
+                                   (void *) (long) child_index,
+                                   priv->children[child_index],
+                                   priv->children[child_index]->fops->entrylk,
+                                   lower, lower_name,
+                                   ENTRYLK_LOCK, ENTRYLK_WRLCK);
+
+                STACK_WIND_COOKIE (frame, afr_lock_cbk,
+                                   (void *) (long) child_index,
+                                   priv->children[child_index],
+                                   priv->children[child_index]->fops->entrylk,
+                                   higher, higher_name,
+                                   ENTRYLK_LOCK, ENTRYLK_WRLCK);
+
+                break;
+        }
+
+        case AFR_ENTRY_TRANSACTION:
+                if (local->fd) {
+                        STACK_WIND_COOKIE (frame, afr_lock_cbk,
+                                           (void *) (long) child_index,
+                                           priv->children[child_index],
+                                           priv->children[child_index]->fops->fentrylk,
+                                           local->fd,
+                                           local->transaction.basename,
+                                           ENTRYLK_LOCK, ENTRYLK_WRLCK);
+                } else {
+                        STACK_WIND_COOKIE (frame, afr_lock_cbk,
+                                           (void *) (long) child_index,
+                                           priv->children[child_index],
+                                           priv->children[child_index]->fops->entrylk,
+                                           &local->transaction.parent_loc,
+                                           local->transaction.basename,
+                                           ENTRYLK_LOCK, ENTRYLK_WRLCK);
+                }
+
+                break;
+        }
+
+        return 0;
+}
+
+/* afr_lock - start the locking phase by calling afr_lock_rec () for child 0 */
+int32_t afr_lock (call_frame_t *frame, xlator_t *this)
+{
+ return afr_lock_rec (frame, this, 0);
+}
+
+
+/* }}} */
+
+/**
+ * afr_transaction_resume - continue a transaction after its fop phase
+ *
+ * Runs the post-op changelog when __changelog_needed_post_op () asks for
+ * it. Otherwise the transaction is finished right away: transaction.done
+ * is called when afr_lock_server_count () is 0 for this transaction type,
+ * afr_unlock () when it is not.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_transaction_resume (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *ctx  = frame->local;
+        afr_private_t *conf = this->private;
+
+        if (__changelog_needed_post_op (frame, this)) {
+                afr_changelog_post_op (frame, this);
+                return 0;
+        }
+
+        if (afr_lock_server_count (conf, ctx->transaction.type) != 0)
+                afr_unlock (frame, this);
+        else
+                ctx->transaction.done (frame, this);
+
+        return 0;
+}
+
+
+/**
+ * afr_transaction_child_died - inform that a child died during an fop
+ *
+ * Passes the transaction's pending_array, the configured child count and
+ * @child_index on to __mark_child_dead ().
+ */
+
+void
+afr_transaction_child_died (call_frame_t *frame, xlator_t *this, int child_index)
+{
+        afr_local_t   *ctx  = frame->local;
+        afr_private_t *conf = this->private;
+
+        __mark_child_dead (ctx->pending_array, conf->child_count, child_index);
+}
+
+
+/**
+ * afr_transaction - entry point that drives an fop as a transaction
+ *
+ * Initialises the transaction part of frame->local, records @type and the
+ * resume handler, then starts the first phase: locking when
+ * afr_lock_server_count () is non-zero for @type, otherwise the pre-op
+ * changelog if __changelog_needed_pre_op () asks for it, otherwise the
+ * fop itself.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
+{
+        afr_local_t   *ctx  = frame->local;
+        afr_private_t *conf = this->private;
+
+        afr_transaction_local_init (ctx, conf);
+
+        ctx->transaction.type   = type;
+        ctx->transaction.resume = afr_transaction_resume;
+
+        if (afr_lock_server_count (conf, ctx->transaction.type) != 0) {
+                afr_lock (frame, this);
+                return 0;
+        }
+
+        /* no lock servers configured for this type: skip the lock phase */
+        if (__changelog_needed_pre_op (frame, this))
+                afr_changelog_pre_op (frame, this);
+        else
+                ctx->transaction.fop (frame, this);
+
+        return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
new file mode 100644
index 000000000..49cdd219f
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -0,0 +1,36 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __TRANSACTION_H__
+#define __TRANSACTION_H__
+/* Extended attribute keys under which the pending (changelog) arrays are
+ * requested in lookup and sent in the transaction xattrops. */
+#define AFR_METADATA_PENDING "trusted.glusterfs.afr.metadata-pending"
+
+#define AFR_DATA_PENDING "trusted.glusterfs.afr.data-pending"
+
+#define AFR_ENTRY_PENDING "trusted.glusterfs.afr.entry-pending"
+/* Marks child_index as dead in the pending array of the running transaction. */
+void
+afr_transaction_child_died (call_frame_t *frame, xlator_t *this,
+ int child_index);
+/* Runs the fop stored in frame->local as a transaction of the given type. */
+int32_t
+afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type);
+
+#endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
new file mode 100644
index 000000000..e4c1a8479
--- /dev/null
+++ b/xlators/cluster/afr/src/afr.c
@@ -0,0 +1,2338 @@
+/*
+ Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+#include "byte-order.h"
+
+#include "afr-inode-read.h"
+#include "afr-inode-write.h"
+#include "afr-dir-read.h"
+#include "afr-dir-write.h"
+#include "afr-transaction.h"
+
+#include "afr-self-heal.h"
+
+
+/**
+ * afr_local_sh_cleanup - release the self-heal state kept in local->self_heal
+ *
+ * Frees the buffers and per-child arrays of the afr_self_heal_t, drops the
+ * references held on the per-child xattr dicts and on healing_fd, and
+ * wipes parent_loc. priv->child_count gives the length of the per-child
+ * arrays.
+ */
+
+void
+afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)
+{
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+
+ sh = &local->self_heal;
+ priv = this->private;
+
+ if (sh->buf)
+ FREE (sh->buf);
+
+ if (sh->xattr) {
+ for (i = 0; i < priv->child_count; i++) { /* unref each child's dict before freeing the array */
+ if (sh->xattr[i]) {
+ dict_unref (sh->xattr[i]);
+ sh->xattr[i] = NULL;
+ }
+ }
+ FREE (sh->xattr);
+ }
+
+ if (sh->child_errno)
+ FREE (sh->child_errno);
+
+ if (sh->pending_matrix) {
+ for (i = 0; i < priv->child_count; i++) { /* free the rows, then the row table */
+ FREE (sh->pending_matrix[i]);
+ }
+ FREE (sh->pending_matrix);
+ }
+
+ if (sh->delta_matrix) {
+ for (i = 0; i < priv->child_count; i++) { /* free the rows, then the row table */
+ FREE (sh->delta_matrix[i]);
+ }
+ FREE (sh->delta_matrix);
+ }
+
+ if (sh->sources)
+ FREE (sh->sources);
+
+ if (sh->success)
+ FREE (sh->success);
+
+ if (sh->healing_fd) {
+ fd_unref (sh->healing_fd);
+ sh->healing_fd = NULL;
+ }
+
+ loc_wipe (&sh->parent_loc);
+}
+
+/**
+ * afr_local_cleanup - cleanup everything in frame->local
+ *
+ * Does nothing for a NULL @local. Otherwise releases the self-heal state
+ * (afr_local_sh_cleanup ()), the per-child arrays, the locs, the
+ * transaction bookkeeping, the fd and xattr_req references, and finally
+ * the per-fop members kept in local->cont. The afr_local_t itself is not
+ * freed here.
+ */
+void
+afr_local_cleanup (afr_local_t *local, xlator_t *this)
+{
+ if (!local)
+ return;
+
+ afr_local_sh_cleanup (local, this);
+
+ FREE (local->child_errno);
+ FREE (local->pending_array);
+
+ loc_wipe (&local->loc);
+ loc_wipe (&local->newloc);
+
+ FREE (local->transaction.locked_nodes);
+ FREE (local->transaction.child_errno);
+
+ FREE (local->transaction.basename);
+ FREE (local->transaction.new_basename);
+
+ loc_wipe (&local->transaction.parent_loc);
+ loc_wipe (&local->transaction.new_parent_loc);
+
+ if (local->fd)
+ fd_unref (local->fd);
+
+ if (local->xattr_req)
+ dict_unref (local->xattr_req);
+
+ FREE (local->child_up);
+
+ { /* lookup */
+ if (local->cont.lookup.xattr)
+ dict_unref (local->cont.lookup.xattr);
+ }
+
+ { /* getxattr */
+ if (local->cont.getxattr.name)
+ FREE (local->cont.getxattr.name);
+ }
+
+ { /* lk */
+ if (local->cont.lk.locked_nodes)
+ FREE (local->cont.lk.locked_nodes);
+ }
+
+ { /* checksum */
+ if (local->cont.checksum.file_checksum)
+ FREE (local->cont.checksum.file_checksum);
+ if (local->cont.checksum.dir_checksum)
+ FREE (local->cont.checksum.dir_checksum);
+ }
+
+ { /* create */
+ if (local->cont.create.fd)
+ fd_unref (local->cont.create.fd);
+ }
+
+ { /* writev */
+ FREE (local->cont.writev.vector);
+ }
+
+ { /* setxattr */
+ if (local->cont.setxattr.dict)
+ dict_unref (local->cont.setxattr.dict);
+ }
+
+ { /* removexattr */
+ FREE (local->cont.removexattr.name);
+ }
+
+ { /* symlink */
+ FREE (local->cont.symlink.linkpath);
+ }
+}
+
+
+/**
+ * afr_frame_return - note that one wound call has returned
+ *
+ * Decrements local->call_count while holding frame->lock and returns the
+ * value left after the decrement; 0 means this was the last outstanding
+ * reply.
+ */
+int
+afr_frame_return (call_frame_t *frame)
+{
+        afr_local_t *ctx       = frame->local;
+        int          remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+                remaining = --ctx->call_count;
+        }
+        UNLOCK (&frame->lock);
+
+        return remaining;
+}
+
+/**
+ * afr_first_up_child - return the index of the first child that is up
+ *
+ * Scans priv->child_up under priv->lock. Returns the lowest index whose
+ * flag is set, or -1 when no child is up.
+ */
+
+int
+afr_first_up_child (afr_private_t *priv)
+{
+        int ret = -1;
+        int i   = 0;
+
+        LOCK (&priv->lock);
+        {
+                for (i = 0; i < priv->child_count; i++) {
+                        if (priv->child_up[i]) {
+                                ret = i;
+                                break;
+                        }
+                }
+        }
+        UNLOCK (&priv->lock);
+
+        return ret;
+}
+
+
+/**
+ * afr_up_children_count - return the number of children that are up
+ *
+ * Counts the non-zero entries among the first @child_count elements of
+ * @child_up.
+ */
+
+int
+afr_up_children_count (int child_count, unsigned char *child_up)
+{
+        int up  = 0;
+        int idx = 0;
+
+        while (idx < child_count) {
+                if (child_up[idx] != 0)
+                        up += 1;
+                idx++;
+        }
+
+        return up;
+}
+
+
+/**
+ * afr_locked_nodes_count - return the number of children marked as locked
+ *
+ * Counts the non-zero entries among the first @child_count elements of
+ * @locked_nodes.
+ */
+int
+afr_locked_nodes_count (unsigned char *locked_nodes, int child_count)
+{
+        int locked = 0;
+        int idx    = 0;
+
+        while (idx < child_count) {
+                if (locked_nodes[idx] != 0)
+                        locked += 1;
+                idx++;
+        }
+
+        return locked;
+}
+
+
+/**
+ * afr_itransform - fold a child index into an inode number
+ *
+ * Returns (@ino * @child_count) + @child_index, so that
+ * afr_deitransform_orig () can get the child index back with a modulo.
+ * The all-ones value ((uint64_t) -1) is passed through unchanged.
+ */
+ino64_t
+afr_itransform (ino64_t ino, int child_count, int child_index)
+{
+        if (ino == ((uint64_t) -1))
+                return ((uint64_t) -1);
+
+        return (ino * child_count) + child_index;
+}
+
+
+/**
+ * afr_deitransform_orig - recover the child index from a transformed inode
+ *
+ * Returns @ino modulo @child_count, i.e. the child_index that
+ * afr_itransform () added.
+ */
+int
+afr_deitransform_orig (ino64_t ino, int child_count)
+{
+        return ino % child_count;
+}
+
+/* afr_deitransform - always returns 0; both arguments are ignored. */
+int
+afr_deitransform (ino64_t ino, int child_count)
+{
+ return 0;
+}
+
+/**
+ * afr_self_heal_cbk - completion handler for self-heal started from lookup
+ *
+ * When the lookup found mismatching file types (local->govinda_gOvinda),
+ * a context value of 1 is stored on the inode for this xlator; a failure
+ * to store it turns the lookup into an error. Otherwise any such context
+ * is deleted. The lookup is then unwound with the values collected in
+ * local->cont.lookup.
+ *
+ * Always returns 0.
+ */
+int
+afr_self_heal_cbk (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ int ret = -1;
+
+ local = frame->local;
+
+ if (local->govinda_gOvinda) {
+ ret = inode_ctx_put (local->cont.lookup.inode, this, 1);
+
+ if (ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -ret;
+ }
+ } else {
+ inode_ctx_del (local->cont.lookup.inode, this, NULL);
+ }
+
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ local->cont.lookup.inode,
+ &local->cont.lookup.buf,
+ local->cont.lookup.xattr);
+
+ return 0;
+}
+
+/**
+ * afr_lookup_cbk - collect the lookup replies from the children
+ *
+ * @cookie carries the index of the child that replied.
+ *
+ * A failed reply only updates enoent_count (for ENOENT) and
+ * local->op_errno (for anything but ENOTCONN). A successful reply sets the
+ * need_*_self_heal flags from the pending xattrs, adds the child's open fd
+ * count to local->open_fd_count, and is then either stored as the reply to
+ * return (first success) or compared against the stored stat to detect
+ * mismatching type, permissions, ownership or size.
+ *
+ * When the last reply is in, self-heal is started if any need_*_self_heal
+ * flag is set and no open fd was reported; otherwise the lookup is unwound
+ * with the stored values.
+ *
+ * Always returns 0.
+ */
+int
+afr_lookup_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ struct stat * lookup_buf = NULL;
+ int call_count = -1;
+ int child_index = -1;
+ int prev_child_index = -1;
+ uint32_t open_fd_count = 0;
+ int ret = 0;
+
+ child_index = (long) cookie;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ local = frame->local;
+
+ lookup_buf = &local->cont.lookup.buf;
+
+ if (op_ret == -1) {
+ if (op_errno == ENOENT)
+ local->enoent_count++;
+
+ if (op_errno != ENOTCONN)
+ local->op_errno = op_errno;
+
+ goto unlock;
+ }
+
+ if (afr_sh_has_metadata_pending (xattr, child_index, this))
+ local->need_metadata_self_heal = 1;
+
+ if (afr_sh_has_entry_pending (xattr, child_index, this))
+ local->need_entry_self_heal = 1;
+
+ if (afr_sh_has_data_pending (xattr, child_index, this))
+ local->need_data_self_heal = 1;
+
+ ret = dict_get_uint32 (xattr, GLUSTERFS_OPEN_FD_COUNT,
+ &open_fd_count); /* NOTE(review): ret is not checked; open_fd_count was initialised to 0 above */
+ local->open_fd_count += open_fd_count;
+
+ /* in case of revalidate, we need to send stat of the
+ * child whose stat was sent during the first lookup
+ * (so that time stamp does not vary with revalidate).
+ * in case it is down, stat of the first success will
+ * be replied */
+
+ /* inode number should be preserved across revalidates */
+
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+
+ local->cont.lookup.inode = inode;
+ local->cont.lookup.xattr = dict_ref (xattr);
+
+ *lookup_buf = *buf;
+ lookup_buf->st_ino = afr_itransform (buf->st_ino,
+ priv->child_count,
+ child_index);
+ } else {
+ if (FILETYPE_DIFFERS (buf, lookup_buf)) {
+ /* mismatching filetypes with same name
+ -- Govinda !! GOvinda !!!
+ */
+ local->govinda_gOvinda = 1;
+ }
+
+ if (PERMISSION_DIFFERS (buf, lookup_buf)) {
+ /* mismatching permissions */
+ local->need_metadata_self_heal = 1;
+ }
+
+ if (OWNERSHIP_DIFFERS (buf, lookup_buf)) {
+ /* mismatching ownership */
+ local->need_metadata_self_heal = 1;
+ }
+
+ if (SIZE_DIFFERS (buf, lookup_buf)
+ && S_ISREG (buf->st_mode)) {
+ local->need_data_self_heal = 1;
+ }
+
+ prev_child_index = afr_deitransform_orig (lookup_buf->st_ino,
+ priv->child_count);
+ if (child_index < prev_child_index) { /* prefer the stat of the lowest-indexed child */
+ *lookup_buf = *buf;
+ lookup_buf->st_ino = afr_itransform (buf->st_ino,
+ priv->child_count,
+ child_index);
+ }
+ }
+
+ local->success_count++;
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ if (local->op_ret == 0) {
+ /* KLUDGE: assuming DHT will not itransform in
+ revalidate */
+ if (local->cont.lookup.inode->ino)
+ lookup_buf->st_ino =
+ local->cont.lookup.inode->ino;
+ }
+
+ if (local->success_count && local->enoent_count) { /* present on some children, missing on others */
+ local->need_metadata_self_heal = 1;
+ local->need_data_self_heal = 1;
+ local->need_entry_self_heal = 1;
+ }
+
+ if (local->success_count) {
+ /* check for govinda_gOvinda case in previous lookup */
+ if (!inode_ctx_get (local->cont.lookup.inode,
+ this, NULL))
+ local->need_data_self_heal = 1;
+ }
+
+ if ((local->need_metadata_self_heal
+ || local->need_data_self_heal
+ || local->need_entry_self_heal)
+ && (!local->open_fd_count)) {
+
+ if (!local->cont.lookup.inode->st_mode) {
+ /* fix for RT #602 */
+ local->cont.lookup.inode->st_mode =
+ lookup_buf->st_mode;
+ }
+
+ afr_self_heal (frame, this, afr_self_heal_cbk);
+ } else {
+ AFR_STACK_UNWIND (frame, local->op_ret,
+ local->op_errno,
+ local->cont.lookup.inode,
+ &local->cont.lookup.buf,
+ local->cont.lookup.xattr);
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * afr_lookup - look @loc up on every child
+ *
+ * Allocates the frame's afr_local_t, copies @loc and a snapshot of
+ * priv->child_up into it, and winds a lookup to all priv->child_count
+ * children (the loop does not consult child_up). The request dict asks for
+ * the pending xattrs of each self-heal type enabled in priv, sized
+ * child_count * sizeof (int32_t), and for GLUSTERFS_OPEN_FD_COUNT. Replies
+ * are handled by afr_lookup_cbk (). If the local cannot be allocated the
+ * call is unwound with ENOMEM.
+ *
+ * NOTE(review): the memdup () result is not checked, and when @xattr_req
+ * is NULL while no self-heal option is enabled, dict_ref () is called
+ * with NULL - confirm that both are tolerated.
+ *
+ * Always returns 0.
+ */
+int
+afr_lookup (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, dict_t *xattr_req)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int i = 0;
+ int32_t op_errno = 0;
+
+
+ priv = this->private;
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ local->op_ret = -1;
+
+ frame->local = local;
+
+ loc_copy (&local->loc, loc);
+
+ local->reval_child_index = 0;
+
+ local->call_count = priv->child_count;
+
+ local->child_up = memdup (priv->child_up, priv->child_count);
+ local->child_count = afr_up_children_count (priv->child_count,
+ local->child_up);
+
+ /* By default assume ENOTCONN. On success it will be set to 0. */
+ local->op_errno = ENOTCONN;
+
+ if ((xattr_req == NULL)
+ && (priv->metadata_self_heal
+ || priv->data_self_heal
+ || priv->entry_self_heal))
+ local->xattr_req = dict_new ();
+ else
+ local->xattr_req = dict_ref (xattr_req);
+
+ if (priv->metadata_self_heal) {
+ ret = dict_set_uint64 (local->xattr_req, AFR_METADATA_PENDING,
+ priv->child_count * sizeof(int32_t));
+ }
+
+ if (priv->data_self_heal) {
+ ret = dict_set_uint64 (local->xattr_req, AFR_DATA_PENDING,
+ priv->child_count * sizeof(int32_t));
+ }
+
+ if (priv->entry_self_heal) {
+ ret = dict_set_uint64 (local->xattr_req, AFR_ENTRY_PENDING,
+ priv->child_count * sizeof(int32_t));
+ }
+
+ ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 0);
+
+ for (i = 0; i < priv->child_count; i++) {
+ STACK_WIND_COOKIE (frame, afr_lookup_cbk, (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->lookup,
+ loc, local->xattr_req);
+ }
+
+ ret = 0; /* the results of the dict_set_uint64 () calls above are discarded here */
+out:
+ if (ret == -1)
+ AFR_STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL, NULL);
+
+ return 0;
+}
+
+
+/* {{{ open */
+/* afr_open_ftruncate_cbk - unwinds the open with the result stored in local;
+ * the outcome of the ftruncate itself (op_ret, op_errno, buf) is ignored. */
+int
+afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = frame->local;
+
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ local->fd);
+ return 0;
+}
+
+
+/**
+ * afr_open_cbk - collect the open replies from the children
+ *
+ * Records the errno of failed replies and the op_ret of successful ones in
+ * frame->local. When the last reply is in, the open is unwound with the
+ * stored result; if the caller asked for O_TRUNC and at least one child
+ * opened the file, an ftruncate to 0 is first wound through this xlator
+ * and the unwind happens in afr_open_ftruncate_cbk ().
+ *
+ * The ftruncate uses local->fd rather than the @fd argument: @fd belongs
+ * to whichever reply happened to arrive last, which may be one from a
+ * child whose open failed.
+ *
+ * Always returns 0.
+ */
+int
+afr_open_cbk (call_frame_t *frame, void *cookie,
+              xlator_t *this, int32_t op_ret, int32_t op_errno,
+              fd_t *fd)
+{
+        afr_local_t * local = NULL;
+
+        int call_count = -1;
+
+        local = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == -1) {
+                        local->op_errno = op_errno;
+                }
+
+                if (op_ret >= 0) {
+                        local->op_ret = op_ret;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                if ((local->cont.open.flags & O_TRUNC)
+                    && (local->op_ret >= 0)) {
+                        STACK_WIND (frame, afr_open_ftruncate_cbk,
+                                    this, this->fops->ftruncate,
+                                    local->fd, 0);
+                } else {
+                        AFR_STACK_UNWIND (frame, local->op_ret,
+                                          local->op_errno, local->fd);
+                }
+        }
+
+        return 0;
+}
+
+/**
+ * afr_open - open @loc on every child that is up
+ *
+ * Refuses with EIO when an inode context is set for this xlator (self-heal
+ * failed earlier). Otherwise the open is wound to each child flagged in
+ * local->child_up with O_TRUNC masked out of @flags; the original flags
+ * are kept in local->cont.open.flags so that afr_open_cbk () can issue the
+ * truncate once the opens have returned.
+ *
+ * NOTE(review): when AFR_LOCAL_INIT fails, frame->local has not been set
+ * yet - confirm that the allocated local is released on that path.
+ *
+ * Always returns 0; errors are reported through the unwind.
+ */
+int
+afr_open (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, fd_t *fd)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+
+ int i = 0;
+ int ret = -1;
+
+ int32_t call_count = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ int32_t wind_flags = flags & (~O_TRUNC);
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+ VALIDATE_OR_GOTO (loc, out);
+
+ priv = this->private;
+
+ ret = inode_ctx_get (loc->inode, this, NULL);
+ if (ret == 0) {
+ /* if ctx is set it means self-heal failed */
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "returning EIO, file has to be manually corrected "
+ "in backend");
+ op_errno = EIO;
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ frame->local = local;
+ call_count = local->call_count;
+
+ local->cont.open.flags = flags;
+ local->fd = fd_ref (fd);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->open,
+ loc, wind_flags, fd);
+
+ if (!--call_count) /* every expected call has been wound */
+ break;
+ }
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (frame, op_ret, op_errno, fd);
+ }
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ flush */
+
+/**
+ * afr_flush_wind_cbk - reply handler for the flushes sent by afr_flush_wind ()
+ *
+ * Stores the reply's op_errno in frame->local and sets local->op_ret to 0
+ * as soon as one child succeeds. The last reply resumes the transaction.
+ *
+ * Always returns 0.
+ */
+int
+afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *ctx = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                ctx->op_errno = op_errno;
+
+                if (op_ret == 0)
+                        ctx->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        /* more replies are still outstanding */
+        if (afr_frame_return (frame) != 0)
+                return 0;
+
+        ctx->transaction.resume (frame, this);
+
+        return 0;
+}
+
+/**
+ * afr_flush_wind - fop phase of the flush transaction
+ *
+ * Winds a flush on local->fd to every child flagged in local->child_up,
+ * passing the child index as cookie; replies are collected by
+ * afr_flush_wind_cbk (). When no child is up the transaction is resumed
+ * immediately.
+ *
+ * Always returns 0.
+ */
+int
+afr_flush_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int i = 0;
+ int call_count = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_flush_wind_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->flush,
+ local->fd);
+
+ if (!--call_count) /* every expected call has been wound */
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/* afr_flush_done - final step of the flush transaction: unwinds the flush
+ * with the op_ret/op_errno stored in frame->local. Always returns 0. */
+int
+afr_flush_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+ return 0;
+}
+
+
+/**
+ * afr_simple_flush_cbk - reply handler for a flush sent outside a transaction
+ *
+ * Stores the reply's op_errno in frame->local and sets local->op_ret to 0
+ * as soon as one child succeeds. The last reply unwinds the flush with the
+ * stored result.
+ *
+ * Always returns 0.
+ */
+int
+afr_simple_flush_cbk (call_frame_t *frame, void *cookie,
+                      xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *ctx = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                ctx->op_errno = op_errno;
+
+                if (op_ret == 0)
+                        ctx->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        /* more replies are still outstanding */
+        if (afr_frame_return (frame) != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, ctx->op_ret, ctx->op_errno);
+
+        return 0;
+}
+
+
+/**
+ * __is_fd_ctx_set - tell whether @fd carries a context for this xlator
+ *
+ * Returns 1 when fd_ctx_get () returns 0 for (@fd, @this), 0 otherwise.
+ */
+static int
+__is_fd_ctx_set (xlator_t *this, fd_t *fd)
+{
+        return (fd_ctx_get (fd, this, NULL) == 0) ? 1 : 0;
+}
+
+
+/**
+ * afr_flush - flush @fd on the children
+ *
+ * When @fd carries a context for this xlator (see __is_fd_ctx_set ()), the
+ * flush runs as an AFR_FLUSH_TRANSACTION over the whole file with
+ * AFR_DATA_PENDING as changelog key; afr_flush_wind () sends the flushes
+ * and afr_flush_done () unwinds. Without a context the flush is simply
+ * wound to every child that is up and afr_simple_flush_cbk () unwinds.
+ *
+ * The error path unwinds with op_ret and op_errno only, like every other
+ * flush unwind in this file.
+ *
+ * NOTE(review): when AFR_LOCAL_INIT fails, frame->local has not been set
+ * yet - confirm that the allocated local is released on that path.
+ *
+ * Always returns 0; errors are reported through the unwind.
+ */
+int
+afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+{
+        afr_private_t * priv  = NULL;
+        afr_local_t *   local = NULL;
+
+        int ret        = -1;
+        int i          = 0;
+        int call_count = 0;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        frame->local = local;
+
+        if (__is_fd_ctx_set (this, fd)) {
+                local->op = GF_FOP_FLUSH;
+                local->transaction.fop  = afr_flush_wind;
+                local->transaction.done = afr_flush_done;
+
+                local->fd = fd_ref (fd);
+
+                local->transaction.start = 0;
+                local->transaction.len   = 0;
+
+                local->transaction.pending = AFR_DATA_PENDING;
+
+                afr_transaction (frame, this, AFR_FLUSH_TRANSACTION);
+        } else {
+                /*
+                 * if fd's ctx is not set, then there is no need
+                 * to erase changelog. So just send the flush
+                 */
+
+                call_count = local->call_count;
+
+                for (i = 0; i < priv->child_count; i++) {
+                        if (local->child_up[i]) {
+                                STACK_WIND (frame, afr_simple_flush_cbk,
+                                            priv->children[i],
+                                            priv->children[i]->fops->flush,
+                                            fd);
+
+                                if (!--call_count)
+                                        break;
+                        }
+                }
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ fsync */
+
+/**
+ * afr_fsync_cbk - reply handler for the fsyncs sent by afr_fsync ()
+ *
+ * Stores the reply's op_errno in frame->local and sets local->op_ret to 0
+ * as soon as one child succeeds. The last reply unwinds the fsync with the
+ * stored result.
+ *
+ * Always returns 0.
+ */
+int
+afr_fsync_cbk (call_frame_t *frame, void *cookie,
+               xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *ctx = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                ctx->op_errno = op_errno;
+
+                if (op_ret == 0)
+                        ctx->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        /* more replies are still outstanding */
+        if (afr_frame_return (frame) != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, ctx->op_ret, ctx->op_errno);
+
+        return 0;
+}
+
+
+/**
+ * afr_fsync - wind fsync to every child marked up in local->child_up
+ *
+ * On a setup failure the frame is unwound with op_ret -1.
+ * Always returns 0.
+ */
+int
+afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
+           int32_t datasync)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int32_t op_ret    = -1;
+        int32_t op_errno  = 0;
+        int32_t remaining = 0;
+        int     status    = -1;
+        int     idx       = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        status = AFR_LOCAL_INIT (local, priv);
+        if (status < 0) {
+                op_errno = -status;
+                goto out;
+        }
+
+        frame->local = local;
+        remaining    = local->call_count;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND (frame, afr_fsync_cbk,
+                            priv->children[idx],
+                            priv->children[idx]->fops->fsync,
+                            fd, datasync);
+
+                /* counts down the copy taken from local->call_count */
+                if (--remaining == 0)
+                        break;
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ fsyncdir */
+
+/**
+ * afr_fsyncdir_cbk - collect one child's reply to fsyncdir
+ *
+ * Marks the call successful when this child returned 0, records the
+ * child's op_errno, and unwinds once afr_frame_return() reports 0.
+ */
+int32_t
+afr_fsyncdir_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local   = frame->local;
+        int          pending = -1;
+
+        LOCK (&frame->lock);
+        {
+                /* errno of the latest reply is stored whatever its outcome */
+                local->op_errno = op_errno;
+
+                if (!op_ret)
+                        local->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+        return 0;
+}
+
+
+/**
+ * afr_fsyncdir - wind fsyncdir to every child marked up in local->child_up
+ * @frame: call frame
+ * @this: AFR
+ * @fd: directory file descriptor
+ * @datasync: passed through unchanged to the children
+ *
+ * On a setup failure the frame is unwound with op_ret -1.
+ * Always returns 0.
+ */
+int32_t
+afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
+              int32_t datasync)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int ret = -1;
+
+        int     i          = 0;
+        int32_t call_count = 0;
+        int32_t op_ret     = -1;
+        int32_t op_errno   = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        call_count = local->call_count;
+        frame->local = local;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (local->child_up[i]) {
+                        /* replies go to the fsyncdir callback, not the
+                           fsync one */
+                        STACK_WIND (frame, afr_fsyncdir_cbk,
+                                    priv->children[i],
+                                    priv->children[i]->fops->fsyncdir,
+                                    fd, datasync);
+                        if (!--call_count)
+                                break;
+                }
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ xattrop */
+
+/**
+ * afr_xattrop_cbk - collect one child's reply to xattrop
+ *
+ * Marks the call successful when this child returned 0 and records the
+ * child's op_errno. Once afr_frame_return() reports 0, unwinds with the
+ * xattr dict received in this (final) reply.
+ */
+int32_t
+afr_xattrop_cbk (call_frame_t *frame, void *cookie,
+                 xlator_t *this, int32_t op_ret, int32_t op_errno,
+                 dict_t *xattr)
+{
+        afr_local_t *local   = frame->local;
+        int          pending = -1;
+
+        LOCK (&frame->lock);
+        {
+                /* errno of the latest reply is stored whatever its outcome */
+                local->op_errno = op_errno;
+
+                if (!op_ret)
+                        local->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr);
+
+        return 0;
+}
+
+
+/**
+ * afr_xattrop - wind xattrop to every child marked up in local->child_up
+ * @frame: call frame
+ * @this: AFR
+ * @loc: location the operation applies to
+ * @optype: xattrop flags, passed through unchanged
+ * @xattr: dict passed through unchanged
+ *
+ * On a setup failure the frame is unwound with op_ret -1 and a NULL dict.
+ * Always returns 0.
+ */
+int32_t
+afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
+             gf_xattrop_flags_t optype, dict_t *xattr)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int ret = -1;
+
+        int     i          = 0;
+        int32_t call_count = 0;
+        int32_t op_ret     = -1;
+        int32_t op_errno   = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        call_count = local->call_count;
+        frame->local = local;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (local->child_up[i]) {
+                        STACK_WIND (frame, afr_xattrop_cbk,
+                                    priv->children[i],
+                                    priv->children[i]->fops->xattrop,
+                                    loc, optype, xattr);
+                        if (!--call_count)
+                                break;
+                }
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                /* xattrop callbacks take a dict_t * after op_errno; pass
+                   an explicit NULL instead of leaving it unspecified */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ fxattrop */
+
+/**
+ * afr_fxattrop_cbk - collect one child's reply to fxattrop
+ *
+ * Marks the call successful when this child returned 0 and records the
+ * child's op_errno. Once afr_frame_return() reports 0, unwinds with the
+ * xattr dict received in this (final) reply.
+ */
+int32_t
+afr_fxattrop_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno,
+                  dict_t *xattr)
+{
+        afr_local_t *local   = frame->local;
+        int          pending = -1;
+
+        LOCK (&frame->lock);
+        {
+                /* errno of the latest reply is stored whatever its outcome */
+                local->op_errno = op_errno;
+
+                if (!op_ret)
+                        local->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr);
+
+        return 0;
+}
+
+
+/**
+ * afr_fxattrop - wind fxattrop to every child marked up in local->child_up
+ * @frame: call frame
+ * @this: AFR
+ * @fd: file descriptor the operation applies to
+ * @optype: xattrop flags, passed through unchanged
+ * @xattr: dict passed through unchanged
+ *
+ * On a setup failure the frame is unwound with op_ret -1 and a NULL dict.
+ * Always returns 0.
+ */
+int32_t
+afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
+              gf_xattrop_flags_t optype, dict_t *xattr)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int ret = -1;
+
+        int     i          = 0;
+        int32_t call_count = 0;
+        int32_t op_ret     = -1;
+        int32_t op_errno   = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        call_count = local->call_count;
+        frame->local = local;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (local->child_up[i]) {
+                        STACK_WIND (frame, afr_fxattrop_cbk,
+                                    priv->children[i],
+                                    priv->children[i]->fops->fxattrop,
+                                    fd, optype, xattr);
+                        if (!--call_count)
+                                break;
+                }
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                /* fxattrop callbacks take a dict_t * after op_errno; pass
+                   an explicit NULL instead of leaving it unspecified */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+        return 0;
+}
+
+/* }}} */
+
+
+/**
+ * afr_inodelk_cbk - collect one child's reply to inodelk
+ *
+ * Marks the call successful when this child returned 0, records the
+ * child's op_errno, and unwinds once afr_frame_return() reports 0.
+ */
+int32_t
+afr_inodelk_cbk (call_frame_t *frame, void *cookie,
+                 xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local   = frame->local;
+        int          pending = -1;
+
+        LOCK (&frame->lock);
+        {
+                /* errno of the latest reply is stored whatever its outcome */
+                local->op_errno = op_errno;
+
+                if (!op_ret)
+                        local->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+        return 0;
+}
+
+
+/**
+ * afr_inodelk - wind inodelk to every child marked up in local->child_up
+ *
+ * On a setup failure the frame is unwound with op_ret -1.
+ * Always returns 0.
+ */
+int32_t
+afr_inodelk (call_frame_t *frame, xlator_t *this, loc_t *loc,
+             int32_t cmd, struct flock *flock)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int32_t op_ret    = -1;
+        int32_t op_errno  = 0;
+        int32_t remaining = 0;
+        int     status    = -1;
+        int     idx       = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        status = AFR_LOCAL_INIT (local, priv);
+        if (status < 0) {
+                op_errno = -status;
+                goto out;
+        }
+
+        frame->local = local;
+        remaining    = local->call_count;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND (frame, afr_inodelk_cbk,
+                            priv->children[idx],
+                            priv->children[idx]->fops->inodelk,
+                            loc, cmd, flock);
+
+                /* counts down the copy taken from local->call_count */
+                if (--remaining == 0)
+                        break;
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+        return 0;
+}
+
+
+/**
+ * afr_finodelk_cbk - collect one child's reply to finodelk
+ *
+ * Marks the call successful when this child returned 0, records the
+ * child's op_errno, and unwinds once afr_frame_return() reports 0.
+ */
+int32_t
+afr_finodelk_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local   = frame->local;
+        int          pending = -1;
+
+        LOCK (&frame->lock);
+        {
+                /* errno of the latest reply is stored whatever its outcome */
+                local->op_errno = op_errno;
+
+                if (!op_ret)
+                        local->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+        return 0;
+}
+
+
+/**
+ * afr_finodelk - wind finodelk to every child marked up in local->child_up
+ *
+ * On a setup failure the frame is unwound with op_ret -1.
+ * Always returns 0.
+ */
+int32_t
+afr_finodelk (call_frame_t *frame, xlator_t *this, fd_t *fd,
+              int32_t cmd, struct flock *flock)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int32_t op_ret    = -1;
+        int32_t op_errno  = 0;
+        int32_t remaining = 0;
+        int     status    = -1;
+        int     idx       = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        status = AFR_LOCAL_INIT (local, priv);
+        if (status < 0) {
+                op_errno = -status;
+                goto out;
+        }
+
+        frame->local = local;
+        remaining    = local->call_count;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND (frame, afr_finodelk_cbk,
+                            priv->children[idx],
+                            priv->children[idx]->fops->finodelk,
+                            fd, cmd, flock);
+
+                /* counts down the copy taken from local->call_count */
+                if (--remaining == 0)
+                        break;
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+        return 0;
+}
+
+
+/**
+ * afr_entrylk_cbk - collect one child's reply to entrylk
+ *
+ * Marks the call successful when this child returned 0, records the
+ * child's op_errno, and unwinds once afr_frame_return() reports 0.
+ */
+int32_t
+afr_entrylk_cbk (call_frame_t *frame, void *cookie,
+                 xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local   = frame->local;
+        int          pending = -1;
+
+        LOCK (&frame->lock);
+        {
+                /* errno of the latest reply is stored whatever its outcome */
+                local->op_errno = op_errno;
+
+                if (!op_ret)
+                        local->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+        return 0;
+}
+
+
+/**
+ * afr_entrylk - wind entrylk to every child marked up in local->child_up
+ *
+ * On a setup failure the frame is unwound with op_ret -1.
+ * Always returns 0.
+ */
+int32_t
+afr_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc,
+             const char *basename, entrylk_cmd cmd, entrylk_type type)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int32_t op_ret    = -1;
+        int32_t op_errno  = 0;
+        int32_t remaining = 0;
+        int     status    = -1;
+        int     idx       = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        status = AFR_LOCAL_INIT (local, priv);
+        if (status < 0) {
+                op_errno = -status;
+                goto out;
+        }
+
+        frame->local = local;
+        remaining    = local->call_count;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND (frame, afr_entrylk_cbk,
+                            priv->children[idx],
+                            priv->children[idx]->fops->entrylk,
+                            loc, basename, cmd, type);
+
+                /* counts down the copy taken from local->call_count */
+                if (--remaining == 0)
+                        break;
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+        return 0;
+}
+
+
+
+/**
+ * afr_fentrylk_cbk - collect one child's reply to fentrylk
+ *
+ * Marks the call successful when this child returned 0, records the
+ * child's op_errno, and unwinds once afr_frame_return() reports 0.
+ */
+int32_t
+afr_fentrylk_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local   = frame->local;
+        int          pending = -1;
+
+        LOCK (&frame->lock);
+        {
+                /* errno of the latest reply is stored whatever its outcome */
+                local->op_errno = op_errno;
+
+                if (!op_ret)
+                        local->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+        return 0;
+}
+
+
+/**
+ * afr_fentrylk - wind fentrylk to every child marked up in local->child_up
+ *
+ * On a setup failure the frame is unwound with op_ret -1.
+ * Always returns 0.
+ */
+int32_t
+afr_fentrylk (call_frame_t *frame, xlator_t *this, fd_t *fd,
+              const char *basename, entrylk_cmd cmd, entrylk_type type)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int32_t op_ret    = -1;
+        int32_t op_errno  = 0;
+        int32_t remaining = 0;
+        int     status    = -1;
+        int     idx       = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        status = AFR_LOCAL_INIT (local, priv);
+        if (status < 0) {
+                op_errno = -status;
+                goto out;
+        }
+
+        frame->local = local;
+        remaining    = local->call_count;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND (frame, afr_fentrylk_cbk,
+                            priv->children[idx],
+                            priv->children[idx]->fops->fentrylk,
+                            fd, basename, cmd, type);
+
+                /* counts down the copy taken from local->call_count */
+                if (--remaining == 0)
+                        break;
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+        return 0;
+}
+
+
+/**
+ * afr_checksum_cbk - collect one child's reply to checksum
+ * @file_checksum: ZR_FILENAME_MAX bytes are copied from here on success
+ * @dir_checksum: ZR_FILENAME_MAX bytes are copied from here on success
+ *
+ * The first successful reply (while local->op_ret is still non-zero) has
+ * its two checksum buffers copied into local->cont.checksum and marks the
+ * call successful. If either copy buffer cannot be allocated the reply is
+ * treated as a failure with ENOMEM. The frame is unwound with the stored
+ * buffers once afr_frame_return() reports 0.
+ */
+int32_t
+afr_checksum_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno,
+                  uint8_t *file_checksum, uint8_t *dir_checksum)
+
+{
+        afr_local_t *local = NULL;
+
+        void *file_copy = NULL;
+        void *dir_copy  = NULL;
+
+        int call_count = -1;
+
+        local = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == 0 && (local->op_ret != 0)) {
+                        file_copy = MALLOC (ZR_FILENAME_MAX);
+                        dir_copy  = MALLOC (ZR_FILENAME_MAX);
+
+                        if (file_copy && dir_copy) {
+                                memcpy (file_copy, file_checksum,
+                                        ZR_FILENAME_MAX);
+                                memcpy (dir_copy, dir_checksum,
+                                        ZR_FILENAME_MAX);
+
+                                local->cont.checksum.file_checksum = file_copy;
+                                local->cont.checksum.dir_checksum  = dir_copy;
+
+                                local->op_ret = 0;
+                        } else {
+                                /* never memcpy into a failed allocation;
+                                   drop whichever half did succeed */
+                                free (file_copy);
+                                free (dir_copy);
+
+                                op_errno = ENOMEM;
+                        }
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0)
+                AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                                  local->cont.checksum.file_checksum,
+                                  local->cont.checksum.dir_checksum);
+
+        return 0;
+}
+
+
+/**
+ * afr_checksum - wind checksum to every child marked up in local->child_up
+ * @frame: call frame
+ * @this: AFR
+ * @loc: location to checksum
+ * @flag: passed through unchanged to the children
+ *
+ * On a setup failure the frame is unwound with op_ret -1 and NULL for both
+ * checksum buffers. Always returns 0.
+ */
+int32_t
+afr_checksum (call_frame_t *frame, xlator_t *this, loc_t *loc,
+              int32_t flag)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int ret = -1;
+
+        int     i          = 0;
+        int32_t call_count = 0;
+        int32_t op_ret     = -1;
+        int32_t op_errno   = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        call_count = local->call_count;
+        frame->local = local;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (local->child_up[i]) {
+                        STACK_WIND (frame, afr_checksum_cbk,
+                                    priv->children[i],
+                                    priv->children[i]->fops->checksum,
+                                    loc, flag);
+
+                        if (!--call_count)
+                                break;
+                }
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                /* checksum callbacks take two buffers after op_errno; pass
+                   explicit NULLs instead of leaving them unspecified */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL);
+        }
+        return 0;
+}
+
+
+/**
+ * afr_statfs_cbk - collect one child's reply to statfs
+ *
+ * Among successful replies, keeps the statvfs with the smallest f_bavail
+ * in local->cont.statfs.buf. A reply with op_ret -1 records its op_errno.
+ * Unwinds with the kept buffer once afr_frame_return() reports 0.
+ */
+int32_t
+afr_statfs_cbk (call_frame_t *frame, void *cookie,
+                xlator_t *this, int32_t op_ret, int32_t op_errno,
+                struct statvfs *statvfs)
+{
+        afr_local_t *local   = NULL;
+        int          pending = 0;
+
+        LOCK (&frame->lock);
+        {
+                local = frame->local;
+
+                if (op_ret == 0) {
+                        local->op_ret = op_ret;
+
+                        if (!local->cont.statfs.buf_set) {
+                                /* first success: take it as-is */
+                                local->cont.statfs.buf     = *statvfs;
+                                local->cont.statfs.buf_set = 1;
+                        } else if (statvfs->f_bavail <
+                                   local->cont.statfs.buf.f_bavail) {
+                                local->cont.statfs.buf = *statvfs;
+                        }
+                } else if (op_ret == -1) {
+                        local->op_errno = op_errno;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                          &local->cont.statfs.buf);
+
+        return 0;
+}
+
+
+/**
+ * afr_statfs - wind statfs to every child marked up in local->child_up
+ *
+ * On a setup failure the frame is unwound with op_ret -1 and a NULL
+ * buffer. Always returns 0.
+ */
+int32_t
+afr_statfs (call_frame_t *frame, xlator_t *this,
+            loc_t *loc)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int32_t op_ret    = -1;
+        int32_t op_errno  = 0;
+        int     total     = 0;
+        int     remaining = 0;
+        int     status    = -1;
+        int     idx       = 0;
+
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        VALIDATE_OR_GOTO (loc, out);
+
+        priv  = this->private;
+        total = priv->child_count;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        status = AFR_LOCAL_INIT (local, priv);
+        if (status < 0) {
+                op_errno = -status;
+                goto out;
+        }
+
+        frame->local = local;
+        remaining    = local->call_count;
+
+        for (idx = 0; idx < total; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND (frame, afr_statfs_cbk,
+                            priv->children[idx],
+                            priv->children[idx]->fops->statfs,
+                            loc);
+
+                /* counts down the copy taken from local->call_count */
+                if (--remaining == 0)
+                        break;
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+        return 0;
+}
+
+
+/**
+ * afr_lk_unlock_cbk - collect one child's reply to the unlock sent by
+ * afr_lk_unlock()
+ *
+ * The reply's own status is not inspected. Once afr_frame_return() reports
+ * 0 the frame is unwound with the op_ret/op_errno already stored in local.
+ */
+int32_t
+afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct flock *lock)
+{
+        afr_local_t *local   = frame->local;
+        int          pending = -1;
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                          lock);
+
+        return 0;
+}
+
+
+/**
+ * afr_lk_unlock - send F_UNLCK to every child recorded in
+ * local->cont.lk.locked_nodes
+ *
+ * If afr_locked_nodes_count() reports no locked children, the frame is
+ * unwound immediately with the op_ret/op_errno stored in local.
+ */
+int32_t
+afr_lk_unlock (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local = frame->local;
+        afr_private_t *priv  = this->private;
+
+        int remaining = 0;
+        int idx       = 0;
+
+        remaining = afr_locked_nodes_count (local->cont.lk.locked_nodes,
+                                            priv->child_count);
+
+        if (remaining == 0) {
+                AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                                  &local->cont.lk.flock);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        /* reuse the stored flock, turned into an unlock request */
+        local->cont.lk.flock.l_type = F_UNLCK;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->cont.lk.locked_nodes[idx])
+                        continue;
+
+                STACK_WIND (frame, afr_lk_unlock_cbk,
+                            priv->children[idx],
+                            priv->children[idx]->fops->lk,
+                            local->fd, F_SETLK,
+                            &local->cont.lk.flock);
+
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_lk_cbk - handle one child's reply to lk and move on to the next child
+ * @cookie: index of the child that replied, as passed to STACK_WIND_COOKIE
+ *
+ * The lock request is sent to the children one at a time, in index order.
+ * A failure that child_went_down() does not classify as the child being
+ * down aborts the sequence and hands over to afr_lk_unlock(). Otherwise
+ * the request is wound to the next index, and after the last index the
+ * frame is unwound.
+ */
+int32_t
+afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct flock *lock)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int child_index = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ /* local->call_count is decremented here; the local copy is not
+ read again in this function */
+ call_count = --local->call_count;
+
+ if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) {
+ /* failure on a child that is not reported as down: record the
+ error and hand over to afr_lk_unlock() */
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+
+ afr_lk_unlock (frame, this);
+ return 0;
+ }
+
+ if (op_ret == 0) {
+ /* remember which child granted the lock so that
+ afr_lk_unlock() can find it */
+ local->op_ret = 0;
+ local->op_errno = 0;
+ local->cont.lk.flock = *lock;
+ local->cont.lk.locked_nodes[child_index] = 1;
+ }
+
+ child_index++;
+
+ if (child_index < priv->child_count) {
+ /* next child; its index travels in the cookie */
+ STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->lk,
+ local->fd, local->cont.lk.cmd,
+ &local->cont.lk.flock);
+ } else if (local->op_ret == -1) {
+ /* all nodes have gone down */
+
+ AFR_STACK_UNWIND (frame, -1, ENOTCONN, &local->cont.lk.flock);
+ } else {
+ /* locking has succeeded on all nodes that are up */
+
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ &local->cont.lk.flock);
+ }
+
+ return 0;
+}
+
+
+/**
+ * afr_lk - lk fop
+ * @frame: call frame
+ * @this: AFR
+ * @fd: file descriptor to lock
+ * @cmd: lock command, stored in local and passed to each child
+ * @flock: lock description, copied into local
+ *
+ * Winds the request to the first child only, with cookie 0; afr_lk_cbk is
+ * registered as the callback. On a setup failure the frame is unwound with
+ * op_ret -1 and a NULL flock. Always returns 0.
+ */
+int
+afr_lk (call_frame_t *frame, xlator_t *this,
+        fd_t *fd, int32_t cmd,
+        struct flock *flock)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int ret = -1;
+        int i   = 0;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        /* checked the same way as in the other fops of this file, which
+           treat a negative return as -errno */
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        frame->local = local;
+
+        local->cont.lk.locked_nodes = CALLOC (priv->child_count,
+                                              sizeof (*local->cont.lk.locked_nodes));
+
+        if (!local->cont.lk.locked_nodes) {
+                gf_log (this->name, GF_LOG_ERROR, "out of memory :(");
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        local->fd            = fd_ref (fd);
+        local->cont.lk.cmd   = cmd;
+        local->cont.lk.flock = *flock;
+
+        /* i is still 0: start with the first child */
+        STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0,
+                           priv->children[i],
+                           priv->children[i]->fops->lk,
+                           fd, cmd, flock);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+        return 0;
+}
+
+
+/**
+ * find_child_index - locate a subvolume in priv->children
+ * @this: AFR
+ * @child: subvolume to look for
+ *
+ * Returns the index of the first entry equal to @child, or
+ * priv->child_count when no entry matches.
+ */
+
+static int
+find_child_index (xlator_t *this, xlator_t *child)
+{
+        afr_private_t *priv = this->private;
+        int            pos  = 0;
+
+        while (pos < priv->child_count && priv->children[pos] != child)
+                pos++;
+
+        return pos;
+}
+
+
+/**
+ * notify - track child up/down events and forward them selectively
+ * @this: AFR
+ * @event: event code
+ * @data: for CHILD_UP / CHILD_DOWN, the child xlator the event is about
+ *
+ * CHILD_UP is forwarded through default_notify() only when exactly one
+ * child is up afterwards; CHILD_DOWN only when no child is up afterwards.
+ * Every other event is forwarded unchanged. An up/down event naming an
+ * xlator that is not one of the children is logged and dropped instead of
+ * indexing past the end of child_up. Always returns 0.
+ */
+int32_t
+notify (xlator_t *this, int32_t event,
+        void *data, ...)
+{
+        afr_private_t * priv     = NULL;
+        unsigned char * child_up = NULL;
+
+        int i           = -1;
+        int up_children = 0;
+
+        priv = this->private;
+
+        if (!priv)
+                return 0;
+
+        child_up = priv->child_up;
+
+        switch (event) {
+        case GF_EVENT_CHILD_UP:
+                i = find_child_index (this, data);
+
+                if (i < 0 || (unsigned int) i >= priv->child_count) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "ignoring CHILD_UP from unknown subvolume");
+                        break;
+                }
+
+                child_up[i] = 1;
+
+                /*
+                  if all the children were down, and one child came up,
+                  send notify to parent
+                */
+
+                for (i = 0; i < priv->child_count; i++)
+                        if (child_up[i])
+                                up_children++;
+
+                if (up_children == 1)
+                        default_notify (this, event, data);
+
+                break;
+
+        case GF_EVENT_CHILD_DOWN:
+                i = find_child_index (this, data);
+
+                if (i < 0 || (unsigned int) i >= priv->child_count) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "ignoring CHILD_DOWN from unknown subvolume");
+                        break;
+                }
+
+                child_up[i] = 0;
+
+                /*
+                  if all children are down, and this was the last to go down,
+                  send notify to parent
+                */
+
+                for (i = 0; i < priv->child_count; i++)
+                        if (child_up[i])
+                                up_children++;
+
+                if (up_children == 0)
+                        default_notify (this, event, data);
+
+                break;
+
+        default:
+                default_notify (this, event, data);
+        }
+
+        return 0;
+}
+
+
+/* Format string logged by init() for the subvolume named by the
+ 'favorite-child' option; it takes the subvolume name three times. */
+static const char *favorite_child_warning_str = "You have specified subvolume '%s' "
+ "as the 'favorite child'. This means that if a discrepancy in the content "
+ "or attributes (ownership, permission, etc.) of a file is detected among "
+ "the subvolumes, the file on '%s' will be considered the definitive "
+ "version and its contents will OVERWRITE the contents of the file on other "
+ "subvolumes. All versions of the file except that on '%s' "
+ "WILL BE LOST.";
+
+/* Warning logged by init() when 'data-lock-server-count' is set to 0.
+   It contains no conversion specifiers. */
+static const char *no_lock_servers_warning_str = "You have set lock-server-count = 0. "
+        "This means correctness is NO LONGER GUARANTEED in all cases. If two or more "
+        "applications write to the same region of a file, there is a possibility that "
+        "its copies will be INCONSISTENT. Set it to a value greater than 0 unless you "
+        "are ABSOLUTELY SURE of what you are doing and WILL NOT HOLD GlusterFS "
+        "RESPONSIBLE for inconsistent data. If you are in doubt, set it to a value "
+        "greater than 0.";
+
+/**
+ * init - parse the translator options and build the private state
+ * @this: AFR
+ *
+ * Reads the self-heal, change-log and lock-server-count options (each
+ * falling back to its default when absent or invalid), counts the
+ * children, and allocates the child_up and children arrays.
+ *
+ * Returns 0 on success and -1 when no children are configured or an
+ * allocation fails.
+ */
+int32_t
+init (xlator_t *this)
+{
+        afr_private_t * priv        = NULL;
+        int             child_count = 0;
+        xlator_list_t * trav        = NULL;
+        int             i           = 0;
+        int             ret         = -1;
+        int             op_errno    = 0;
+
+        char * read_subvol = NULL;
+        char * fav_child   = NULL;
+        char * self_heal   = NULL;
+        char * change_log  = NULL;
+
+        int32_t lock_server_count = 1;
+
+        int fav_ret  = -1;
+        int read_ret = -1;
+        int dict_ret = -1;
+
+        if (!this->children) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "AFR needs more than one child defined");
+                return -1;
+        }
+
+        if (!this->parents) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "dangling volume. check volfile ");
+        }
+
+        ALLOC_OR_GOTO (this->private, afr_private_t, out);
+
+        priv = this->private;
+
+        read_ret = dict_get_str (this->options, "read-subvolume", &read_subvol);
+        priv->read_child = -1;
+
+        fav_ret = dict_get_str (this->options, "favorite-child", &fav_child);
+        priv->favorite_child = -1;
+
+        /* Default values */
+
+        priv->data_self_heal     = 1;
+        priv->metadata_self_heal = 1;
+        priv->entry_self_heal    = 1;
+
+        dict_ret = dict_get_str (this->options, "data-self-heal", &self_heal);
+        if (dict_ret == 0) {
+                ret = gf_string2boolean (self_heal, &priv->data_self_heal);
+                if (ret < 0) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "invalid 'option data-self-heal %s' "
+                                "defaulting to data-self-heal as 'on'",
+                                self_heal);
+                        priv->data_self_heal = 1;
+                }
+        }
+
+        dict_ret = dict_get_str (this->options, "metadata-self-heal",
+                                 &self_heal);
+        if (dict_ret == 0) {
+                ret = gf_string2boolean (self_heal, &priv->metadata_self_heal);
+                if (ret < 0) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "invalid 'option metadata-self-heal %s' "
+                                "defaulting to metadata-self-heal as 'on'",
+                                self_heal);
+                        priv->metadata_self_heal = 1;
+                }
+        }
+
+        dict_ret = dict_get_str (this->options, "entry-self-heal", &self_heal);
+        if (dict_ret == 0) {
+                ret = gf_string2boolean (self_heal, &priv->entry_self_heal);
+                if (ret < 0) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "invalid 'option entry-self-heal %s' "
+                                "defaulting to entry-self-heal as 'on'",
+                                self_heal);
+                        priv->entry_self_heal = 1;
+                }
+        }
+
+        /* Change log options */
+
+        priv->data_change_log     = 1;
+        priv->metadata_change_log = 0;
+        priv->entry_change_log    = 1;
+
+        dict_ret = dict_get_str (this->options, "data-change-log",
+                                 &change_log);
+        if (dict_ret == 0) {
+                ret = gf_string2boolean (change_log, &priv->data_change_log);
+                if (ret < 0) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "invalid 'option data-change-log %s'. "
+                                "defaulting to data-change-log as 'on'",
+                                change_log);
+                        priv->data_change_log = 1;
+                }
+        }
+
+        dict_ret = dict_get_str (this->options, "metadata-change-log",
+                                 &change_log);
+        if (dict_ret == 0) {
+                ret = gf_string2boolean (change_log,
+                                         &priv->metadata_change_log);
+                if (ret < 0) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "invalid 'option metadata-change-log %s'. "
+                                "defaulting to metadata-change-log as 'off'",
+                                change_log);
+                        priv->metadata_change_log = 0;
+                }
+        }
+
+        dict_ret = dict_get_str (this->options, "entry-change-log",
+                                 &change_log);
+        if (dict_ret == 0) {
+                ret = gf_string2boolean (change_log, &priv->entry_change_log);
+                if (ret < 0) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "invalid 'option entry-change-log %s'. "
+                                "defaulting to entry-change-log as 'on'",
+                                change_log);
+                        priv->entry_change_log = 1;
+                }
+        }
+
+        /* Locking options */
+
+        priv->data_lock_server_count     = 1;
+        priv->metadata_lock_server_count = 0;
+        priv->entry_lock_server_count    = 1;
+
+        dict_ret = dict_get_int32 (this->options, "data-lock-server-count",
+                                   &lock_server_count);
+        if (dict_ret == 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "setting data lock server count to %d",
+                        lock_server_count);
+
+                /* the warning text is data, not a format string */
+                if (lock_server_count == 0)
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s", no_lock_servers_warning_str);
+
+                priv->data_lock_server_count = lock_server_count;
+        }
+
+
+        dict_ret = dict_get_int32 (this->options,
+                                   "metadata-lock-server-count",
+                                   &lock_server_count);
+        if (dict_ret == 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "setting metadata lock server count to %d",
+                        lock_server_count);
+                priv->metadata_lock_server_count = lock_server_count;
+        }
+
+
+        dict_ret = dict_get_int32 (this->options, "entry-lock-server-count",
+                                   &lock_server_count);
+        if (dict_ret == 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "setting entry lock server count to %d",
+                        lock_server_count);
+
+                priv->entry_lock_server_count = lock_server_count;
+        }
+
+
+        /* count the children and resolve the two by-name options */
+        trav = this->children;
+        while (trav) {
+                if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "subvolume '%s' specified as read child",
+                                trav->xlator->name);
+
+                        priv->read_child = child_count;
+                }
+
+                if (fav_ret == 0 && !strcmp (fav_child, trav->xlator->name)) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                favorite_child_warning_str, trav->xlator->name,
+                                trav->xlator->name, trav->xlator->name);
+                        priv->favorite_child = child_count;
+                }
+
+                child_count++;
+                trav = trav->next;
+        }
+
+        /* XXX: return inode numbers from 1st subvolume till
+           afr supports read-subvolume based on inode's ctx
+           (and not itransform) for this reason afr_deitransform()
+           returns 0 always
+        */
+        priv->read_child = 0;
+
+        priv->wait_count = 1;
+
+        priv->child_count = child_count;
+        LOCK_INIT (&priv->lock);
+
+        priv->child_up = CALLOC (sizeof (unsigned char), child_count);
+        if (!priv->child_up) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                op_errno = ENOMEM;
+                /* ret may hold 0 from option parsing above */
+                ret = -1;
+                goto out;
+        }
+
+        priv->children = CALLOC (sizeof (xlator_t *), child_count);
+        if (!priv->children) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                op_errno = ENOMEM;
+                /* ret may hold 0 from option parsing above */
+                ret = -1;
+                goto out;
+        }
+
+        trav = this->children;
+        i = 0;
+        while (i < child_count) {
+                priv->children[i] = trav->xlator;
+
+                trav = trav->next;
+                i++;
+        }
+
+        ret = 0;
+out:
+        return ret;
+}
+
+
+/* fini - currently a no-op: nothing is released here and 0 is always
+ returned. */
+int
+fini (xlator_t *this)
+{
+ return 0;
+}
+
+
+/* File-operation table: binds each fop slot to its afr_* implementation.
+ The entries are grouped by kind (see the section comments below). */
+struct xlator_fops fops = {
+ .lookup = afr_lookup,
+ .open = afr_open,
+ .lk = afr_lk,
+ .flush = afr_flush,
+ .statfs = afr_statfs,
+ .fsync = afr_fsync,
+ .fsyncdir = afr_fsyncdir,
+ .xattrop = afr_xattrop,
+ .fxattrop = afr_fxattrop,
+ .inodelk = afr_inodelk,
+ .finodelk = afr_finodelk,
+ .entrylk = afr_entrylk,
+ .fentrylk = afr_fentrylk,
+ .checksum = afr_checksum,
+
+ /* inode read */
+ .access = afr_access,
+ .stat = afr_stat,
+ .fstat = afr_fstat,
+ .readlink = afr_readlink,
+ .getxattr = afr_getxattr,
+ .readv = afr_readv,
+
+ /* inode write */
+ .chmod = afr_chmod,
+ .chown = afr_chown,
+ .fchmod = afr_fchmod,
+ .fchown = afr_fchown,
+ .writev = afr_writev,
+ .truncate = afr_truncate,
+ .ftruncate = afr_ftruncate,
+ .utimens = afr_utimens,
+ .setxattr = afr_setxattr,
+ .removexattr = afr_removexattr,
+
+ /* dir read */
+ .opendir = afr_opendir,
+ .readdir = afr_readdir,
+ .getdents = afr_getdents,
+
+ /* dir write */
+ .create = afr_create,
+ .mknod = afr_mknod,
+ .mkdir = afr_mkdir,
+ .unlink = afr_unlink,
+ .rmdir = afr_rmdir,
+ .link = afr_link,
+ .symlink = afr_symlink,
+ .rename = afr_rename,
+ .setdents = afr_setdents,
+};
+
+
+/* No entries are set in the mops table. */
+struct xlator_mops mops = {
+};
+
+
+/* No entries are set in the cbks table. */
+struct xlator_cbks cbks = {
+};
+
+/* Option table for this translator, ended by an entry whose key is NULL.
+ The keys match the names read by init(). Note that init() currently
+ forces read_child to 0 after parsing, so "read-subvolume" has no
+ effect on it. */
+struct volume_options options[] = {
+ { .key = {"read-subvolume" },
+ .type = GF_OPTION_TYPE_XLATOR
+ },
+ { .key = {"favorite-child"},
+ .type = GF_OPTION_TYPE_XLATOR
+ },
+ { .key = {"data-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"metadata-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"entry-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"data-change-log"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"metadata-change-log"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"entry-change-log"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"data-lock-server-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0
+ },
+ { .key = {"metadata-lock-server-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0
+ },
+ { .key = {"entry-lock-server-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0
+ },
+ { .key = {NULL} },
+};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
new file mode 100644
index 000000000..4cf6cdf9d
--- /dev/null
+++ b/xlators/cluster/afr/src/afr.h
@@ -0,0 +1,523 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#ifndef __AFR_H__
+#define __AFR_H__
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "scheduler.h"
+#include "call-stub.h"
+#include "compat-errno.h"
+
+
+typedef struct _afr_private { /* translator-wide AFR settings and child status */
+ gf_lock_t lock; /* to guard access to child_count, etc */
+ unsigned int child_count; /* total number of children */
+
+ xlator_t **children; /* NOTE(review): presumably child_count entries -- confirm */
+
+ unsigned char *child_up; /* child_count bytes; AFR_LOCAL_INIT copies it into afr_local_t.child_up */
+
+ gf_boolean_t data_self_heal; /* on/off */
+ gf_boolean_t metadata_self_heal; /* on/off */
+ gf_boolean_t entry_self_heal; /* on/off */
+
+
+ gf_boolean_t data_change_log; /* on/off */
+ gf_boolean_t metadata_change_log; /* on/off */
+ gf_boolean_t entry_change_log; /* on/off */
+
+ unsigned int read_child; /* read-subvolume */
+ unsigned int favorite_child; /* subvolume to be preferred in resolving
+ split-brain cases */
+
+ unsigned int data_lock_server_count; /* NOTE(review): presumably from the "data-lock-server-count" option -- confirm */
+ unsigned int metadata_lock_server_count; /* NOTE(review): presumably from "metadata-lock-server-count" -- confirm */
+ unsigned int entry_lock_server_count; /* NOTE(review): presumably from "entry-lock-server-count" -- confirm */
+
+ unsigned int wait_count; /* # of servers to wait for success */
+} afr_private_t;
+
+typedef struct { /* self-heal state; embedded in afr_local_t as the self_heal member */
+ /* array of stat's, one for each child */
+ struct stat *buf;
+
+ /* array of xattr's, one for each child */
+ dict_t **xattr;
+
+ /* array of errno's, one for each child */
+ int *child_errno;
+
+ int32_t **pending_matrix; /* NOTE(review): dimensions are not visible here -- confirm against the self-heal code */
+ int32_t **delta_matrix; /* NOTE(review): dimensions are not visible here -- confirm against the self-heal code */
+
+ int *sources;
+ int source;
+ int active_source;
+ int active_sinks;
+ int *success;
+
+ fd_t *healing_fd;
+ int op_failed;
+
+ int file_has_holes;
+ blksize_t block_size;
+ off_t file_size;
+ off_t offset;
+
+ loc_t parent_loc;
+ int (*completion_cbk) (call_frame_t *frame, xlator_t *this); /* NOTE(review): presumably called when a self-heal pass ends -- confirm */
+ call_frame_t *sh_frame;
+} afr_self_heal_t;
+
+
+typedef enum { /* kind of transaction; stored in afr_local_t.transaction.type */
+ AFR_DATA_TRANSACTION, /* truncate, write, ... */
+ AFR_METADATA_TRANSACTION, /* chmod, chown, ... */
+ AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */
+ AFR_ENTRY_RENAME_TRANSACTION, /* rename */
+ AFR_FLUSH_TRANSACTION, /* flush */
+} afr_transaction_type;
+
+typedef struct _afr_local { /* per-call state; AFR_STACK_UNWIND / AFR_STACK_DESTROY read it from frame->local */
+ unsigned int call_count; /* AFR_LOCAL_INIT sets this to the number of children that are up */
+ unsigned int success_count;
+ unsigned int enoent_count;
+
+ unsigned int need_metadata_self_heal;
+ unsigned int need_entry_self_heal;
+ unsigned int need_data_self_heal;
+ unsigned int govinda_gOvinda;
+
+ unsigned int reval_child_index;
+ int32_t op_ret; /* AFR_LOCAL_INIT starts this at -1 */
+ int32_t op_errno; /* AFR_LOCAL_INIT starts this at EUCLEAN */
+
+ int32_t *pending_array; /* child_count entries, allocated by afr_transaction_local_init */
+
+ loc_t loc;
+ loc_t newloc;
+
+ fd_t *fd;
+
+ glusterfs_fop_t fop;
+
+ unsigned char *child_up; /* copy of afr_private_t.child_up taken by AFR_LOCAL_INIT */
+ int child_count;
+
+ int32_t *child_errno; /* child_count entries, allocated by afr_transaction_local_init */
+
+ dict_t *xattr_req;
+ int open_fd_count;
+ /*
+ This struct contains the arguments for the "continuation"
+ (scheme-like) of fops
+ */
+
+ int op;
+ struct {
+ struct {
+ unsigned char buf_set;
+ struct statvfs buf;
+ } statfs;
+
+ struct {
+ inode_t *inode;
+ struct stat buf;
+ dict_t *xattr;
+ } lookup;
+
+ struct {
+ int32_t flags;
+ } open;
+
+ struct {
+ int32_t cmd;
+ struct flock flock;
+ unsigned char *locked_nodes;
+ } lk;
+
+ struct {
+ uint8_t *file_checksum;
+ uint8_t *dir_checksum;
+ } checksum;
+
+ /* inode read */
+
+ struct {
+ int32_t mask;
+ int last_tried; /* index of the child we tried previously */
+ } access;
+
+ struct {
+ int last_tried;
+ ino_t ino;
+ } stat;
+
+ struct {
+ int last_tried;
+ ino_t ino;
+ } fstat;
+
+ struct {
+ size_t size;
+ int last_tried;
+ } readlink;
+
+ struct {
+ const char *name;
+ int last_tried;
+ } getxattr;
+
+ struct {
+ size_t size;
+ off_t offset;
+ int last_tried;
+ } readv;
+
+ /* dir read */
+
+ struct {
+ int success_count;
+ int32_t op_ret;
+ int32_t op_errno;
+ } opendir;
+
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+ size_t size;
+ off_t offset;
+
+ int last_tried;
+ } readdir;
+
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+
+ size_t size;
+ off_t offset;
+ int32_t flag;
+
+ int last_tried;
+ } getdents;
+
+ /* inode write */
+
+ struct {
+ ino_t ino;
+ mode_t mode;
+ struct stat buf;
+ } chmod;
+
+ struct {
+ ino_t ino;
+ mode_t mode;
+ struct stat buf;
+ } fchmod;
+
+ struct {
+ ino_t ino;
+ uid_t uid;
+ gid_t gid;
+ struct stat buf;
+ } chown;
+
+ struct {
+ ino_t ino;
+ uid_t uid;
+ gid_t gid;
+ struct stat buf;
+ } fchown;
+
+ struct {
+ ino_t ino;
+ struct stat buf;
+
+ int32_t op_ret;
+
+ struct iovec *vector;
+ dict_t *refs;
+ int32_t count;
+ off_t offset;
+ } writev;
+
+ struct {
+ ino_t ino;
+ off_t offset;
+ struct stat buf;
+ } truncate;
+
+ struct {
+ ino_t ino;
+ off_t offset;
+ struct stat buf;
+ } ftruncate;
+
+ struct {
+ ino_t ino;
+ struct timespec tv[2];
+ struct stat buf;
+ } utimens;
+
+ struct {
+ dict_t *dict;
+ int32_t flags;
+ } setxattr;
+
+ struct {
+ const char *name;
+ } removexattr;
+
+ /* dir write */
+
+ struct {
+ ino_t ino;
+ fd_t *fd;
+ int32_t flags;
+ mode_t mode;
+ inode_t *inode;
+ struct stat buf;
+ } create;
+
+ struct {
+ ino_t ino;
+ dev_t dev;
+ mode_t mode;
+ inode_t *inode;
+ struct stat buf;
+ } mknod;
+
+ struct {
+ ino_t ino;
+ int32_t mode;
+ inode_t *inode;
+ struct stat buf;
+ } mkdir;
+
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+ } unlink;
+
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+ } rmdir;
+
+ struct {
+ ino_t ino;
+ struct stat buf;
+ } rename;
+
+ struct {
+ ino_t ino;
+ inode_t *inode;
+ struct stat buf;
+ } link;
+
+ struct {
+ ino_t ino;
+ inode_t *inode;
+ struct stat buf;
+ char *linkpath;
+ } symlink;
+
+ struct {
+ int32_t flags;
+ dir_entry_t *entries;
+ int32_t count;
+ } setdents;
+ } cont;
+
+ struct {
+ off_t start, len;
+
+ unsigned char *locked_nodes; /* child_count entries, allocated by afr_transaction_local_init */
+ int lock_count;
+
+ const char *basename;
+ const char *new_basename;
+
+ char *pending;
+
+ loc_t parent_loc;
+ loc_t new_parent_loc;
+
+ afr_transaction_type type;
+
+ int success_count;
+ int erase_pending; /* AFR_LOCAL_INIT sets this to 1 */
+ int failure_count;
+
+ int last_tried;
+ int32_t *child_errno; /* child_count entries, allocated by afr_transaction_local_init */
+
+ call_frame_t *main_frame;
+
+ int (*fop) (call_frame_t *frame, xlator_t *this);
+
+ int (*done) (call_frame_t *frame, xlator_t *this);
+
+ int (*resume) (call_frame_t *frame, xlator_t *this);
+
+ int (*unwind) (call_frame_t *frame, xlator_t *this);
+ } transaction;
+
+ afr_self_heal_t self_heal; /* self-heal state for this call */
+} afr_local_t;
+
+/*
+ * Try to allocate room for one `type` with CALLOC and store it in `var`;
+ * if that fails, log the failure, set op_errno to ENOMEM and jump to
+ * `label`.
+ *
+ * The expansion uses `this` and `op_errno` from the calling scope, so both
+ * must be declared wherever the macro is used.
+ *
+ * The expansion carries no trailing semicolon: the ';' written at the call
+ * site completes the statement, so the macro behaves like a single
+ * statement even as the body of an unbraced if/else.
+ */
+#define ALLOC_OR_GOTO(var, type, label) do {                    \
+                (var) = CALLOC (sizeof (type), 1);              \
+                if (!(var)) {                                   \
+                        gf_log (this->name, GF_LOG_ERROR,       \
+                                "out of memory :(");            \
+                        op_errno = ENOMEM;                      \
+                        goto label;                             \
+                }                                               \
+        } while (0)
+
+
+/*
+ * did a call fail due to a child failing?
+ *
+ * True when op_ret is negative and op_errno is ENOTCONN or EBADFD.
+ * Both arguments are parenthesized so that expression arguments
+ * (for example a conditional expression) are compared as a whole.
+ */
+#define child_went_down(op_ret, op_errno) (((op_ret) < 0) &&            \
+                                           (((op_errno) == ENOTCONN) || \
+                                            ((op_errno) == EBADFD)))
+
+/* true once index `i` refers to the last of `count` children */
+#define all_tried(i, count) (((count) - 1) == (i))
+
+void
+afr_build_parent_loc (loc_t *parent, loc_t *child);
+
+int
+afr_up_children_count (int child_count, unsigned char *child_up);
+
+int
+afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
+
+int
+afr_first_up_child (afr_private_t *priv);
+
+ino64_t
+afr_itransform (ino64_t ino, int child_count, int child_index);
+
+int
+afr_deitransform (ino64_t ino, int child_count);
+
+void
+afr_local_cleanup (afr_local_t *local, xlator_t *this);
+
+int
+afr_frame_return (call_frame_t *frame);
+
+/*
+ * Unwind `frame` with `params`, then release the afr_local_t that was
+ * attached to it.
+ *
+ * frame->local and frame->this are captured and frame->local is cleared
+ * before STACK_UNWIND runs; the captured local is then handed to
+ * afr_local_cleanup() and freed.
+ *
+ * `frame` is expanded several times, so pass an expression without side
+ * effects.  The expansion has no trailing semicolon; the ';' at the call
+ * site completes the statement.
+ */
+#define AFR_STACK_UNWIND(frame, params ...)                     \
+        do {                                                    \
+                afr_local_t *__local = NULL;                    \
+                xlator_t    *__this  = NULL;                    \
+                __local = (frame)->local;                       \
+                __this = (frame)->this;                         \
+                (frame)->local = NULL;                          \
+                STACK_UNWIND (frame, params);                   \
+                afr_local_cleanup (__local, __this);            \
+                free (__local);                                 \
+        } while (0)
+
+/*
+ * Destroy the call stack that `frame` belongs to (STACK_DESTROY on
+ * frame->root), then release the afr_local_t that was attached to `frame`.
+ *
+ * frame->local and frame->this are captured and frame->local is cleared
+ * before the stack is destroyed; the captured local is then handed to
+ * afr_local_cleanup() and freed.
+ *
+ * `frame` is expanded several times, so pass an expression without side
+ * effects.  The expansion has no trailing semicolon; the ';' at the call
+ * site completes the statement.
+ */
+#define AFR_STACK_DESTROY(frame)                                \
+        do {                                                    \
+                afr_local_t *__local = NULL;                    \
+                xlator_t    *__this  = NULL;                    \
+                __local = (frame)->local;                       \
+                __this = (frame)->this;                         \
+                (frame)->local = NULL;                          \
+                STACK_DESTROY ((frame)->root);                  \
+                afr_local_cleanup (__local, __this);            \
+                free (__local);                                 \
+        } while (0)
+
+/*
+ * Allocate and return a string that is the basename of `str`.
+ *
+ * Returns NULL when `str` is NULL or when memory cannot be allocated;
+ * otherwise the caller owns the returned string.
+ */
+static inline char *
+AFR_BASENAME (const char *str)
+{
+        char *path_copy = NULL;
+        char *name_copy = NULL;
+
+        if (!str)
+                return NULL;
+
+        /* basename() is given a private copy, never the caller's string */
+        path_copy = strdup (str);
+        if (!path_copy)
+                return NULL;
+
+        /* copy the result before the buffer it may point into is freed */
+        name_copy = strdup (basename (path_copy));
+        FREE (path_copy);
+
+        return name_copy;
+}
+
+/*
+ * Prepare a fresh afr_local_t for a call.
+ *
+ * Takes a private copy of priv->child_up, counts the children that are up
+ * into local->call_count, and seeds transaction.erase_pending, op_ret and
+ * op_errno.
+ *
+ * Returns 0 on success, -ENOMEM when the copy cannot be allocated and
+ * -ENOTCONN when no child is up.
+ */
+static inline int
+AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv)
+{
+        unsigned char *up_copy = NULL;
+
+        up_copy = CALLOC (sizeof (*local->child_up), priv->child_count);
+        local->child_up = up_copy;
+        if (up_copy == NULL)
+                return -ENOMEM;
+
+        memcpy (up_copy, priv->child_up,
+                sizeof (*local->child_up) * priv->child_count);
+
+        local->call_count = afr_up_children_count (priv->child_count,
+                                                   up_copy);
+        if (!local->call_count)
+                return -ENOTCONN;
+
+        local->transaction.erase_pending = 1;
+
+        /* pessimistic defaults */
+        local->op_errno = EUCLEAN;
+        local->op_ret   = -1;
+
+        return 0;
+}
+
+
+/*
+ * Allocate the per-child arrays a transaction needs: local->child_errno,
+ * local->pending_array, local->transaction.locked_nodes and
+ * local->transaction.child_errno, each with priv->child_count entries.
+ *
+ * Returns 0 when all four allocations succeed and -ENOMEM as soon as one
+ * fails.  Arrays allocated before the failure stay attached to `local`.
+ */
+static inline int
+afr_transaction_local_init (afr_local_t *local, afr_private_t *priv)
+{
+        local->child_errno = CALLOC (sizeof (*local->child_errno),
+                                     priv->child_count);
+        if (!local->child_errno) {
+                return -ENOMEM;
+        }
+
+        local->pending_array = CALLOC (sizeof (*local->pending_array),
+                                       priv->child_count);
+        if (!local->pending_array) {
+                return -ENOMEM;
+        }
+
+        local->transaction.locked_nodes =
+                CALLOC (sizeof (*local->transaction.locked_nodes),
+                        priv->child_count);
+        if (!local->transaction.locked_nodes) {
+                return -ENOMEM;
+        }
+
+        local->transaction.child_errno =
+                CALLOC (sizeof (*local->transaction.child_errno),
+                        priv->child_count);
+        if (!local->transaction.child_errno) {
+                return -ENOMEM;
+        }
+
+        return 0;
+}
+
+#endif /* __AFR_H__ */
diff --git a/xlators/cluster/dht/Makefile.am b/xlators/cluster/dht/Makefile.am
new file mode 100644
index 000000000..f963effea
--- /dev/null
+++ b/xlators/cluster/dht/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = src
\ No newline at end of file
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am
new file mode 100644
index 000000000..b7d07d137
--- /dev/null
+++ b/xlators/cluster/dht/src/Makefile.am
@@ -0,0 +1,30 @@
+
+# Two loadable translator modules are built from the shared DHT sources.
+xlator_LTLIBRARIES = dht.la nufa.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+
+# Sources compiled into both modules.
+dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c \
+	dht-selfheal.c dht-rename.c dht-hashfn.c dht-hashfn-tea.c
+
+dht_la_SOURCES = $(dht_common_source) dht.c
+
+nufa_la_SOURCES = $(dht_common_source) nufa.c
+
+# libtool spells this flag "-avoid-version".
+dht_la_LDFLAGS = -module -avoid-version
+dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+nufa_la_LDFLAGS = -module -avoid-version
+nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+# NOTE(review): dht-common.c is listed with the headers, not in a _SOURCES
+# variable -- presumably because dht.c and nufa.c #include it; confirm.
+noinst_HEADERS = dht-common.h dht-common.c
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
+	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES =
+
+# distribute.so is an alias (symlink) for dht.so; remove it on uninstall.
+uninstall-local:
+	rm -f $(DESTDIR)$(xlatordir)/distribute.so
+
+install-data-hook:
+	ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so
\ No newline at end of file
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
new file mode 100644
index 000000000..5e4979e31
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -0,0 +1,3470 @@
+/*
+ Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+/* TODO: add NS locking */
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "dht-common.h"
+#include "defaults.h"
+
+
+/* TODO:
+ - use volumename in xattr instead of "dht"
+ - use NS locks
+ - handle all cases in self heal layout reconstruction
+ - complete linkfile selfheal
+*/
+
+/*
+ * Completion callback passed to dht_selfheal_directory() by
+ * dht_lookup_dir_cbk().
+ *
+ * When self-heal reports success (op_ret == 0) the healed layout is stored
+ * in the inode context and the inode number remembered in local->st_ino,
+ * if any, is put into the stat that is returned.  The frame is then
+ * unwound with the resulting status and local->op_errno.
+ */
+int
+dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie,
+                         xlator_t *this,
+                         int op_ret, int op_errno)
+{
+        dht_local_t  *local  = frame->local;
+        dht_layout_t *healed = NULL;
+        int           status = op_ret;
+
+        if (status != 0)
+                goto unwind;
+
+        healed = local->selfheal.layout;
+        status = inode_ctx_put (local->inode, this, (uint64_t)(long)healed);
+
+        /* once stored, the layout is no longer tracked through local */
+        if (status == 0)
+                local->selfheal.layout = NULL;
+
+        if (!local->st_ino) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "could not find hashed subvolume for %s",
+                        local->loc.path);
+        } else {
+                local->stbuf.st_ino = local->st_ino;
+        }
+
+unwind:
+        DHT_STACK_UNWIND (frame, status, local->op_errno, local->inode,
+                          &local->stbuf, local->xattr);
+
+        return 0;
+}
+
+
+/*
+ * Per-subvolume reply handler for a lookup that was wound to every
+ * subvolume (from dht_lookup() and dht_lookup_cbk()).
+ *
+ * Each reply is merged into local->layout.  A successful reply that
+ * check_is_dir() accepts also has its stat merged into local->stbuf, and
+ * the first such reply supplies the xattr and inode references kept in
+ * `local`.  After the last reply, and if any reply succeeded, the layout is
+ * normalized: when that succeeds it is stored in the inode context and the
+ * frame is unwound; otherwise directory self-heal is started and
+ * dht_lookup_selfheal_cbk() completes the call.
+ */
+int
+dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int op_ret, int op_errno,
+                    inode_t *inode, struct stat *stbuf, dict_t *xattr)
+{
+        dht_conf_t   *conf      = this->private;
+        dht_local_t  *local     = frame->local;
+        call_frame_t *prev      = cookie;
+        dht_layout_t *layout    = local->layout;
+        int           remaining = 0;
+        int           ret       = 0;
+
+        LOCK (&frame->lock);
+        {
+                /* TODO: assert equal mode on stbuf->st_mode and
+                   local->stbuf->st_mode, else mkdir/chmod/chown and fix */
+                /* TODO: assert equal hash type in xattr, local->xattr */
+                /* TODO: always ensure same subvolume is in layout->list[0] */
+
+                ret = dht_layout_merge (this, layout, prev->this,
+                                        op_ret, op_errno, xattr);
+
+                if (op_ret == -1) {
+                        local->op_errno = ENOENT;
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "lookup of %s on %s returned error (%s)",
+                                local->loc.path, prev->this->name,
+                                strerror (op_errno));
+                } else if (check_is_dir (inode, stbuf, xattr)) {
+                        local->op_ret = 0;
+
+                        if (!local->xattr)
+                                local->xattr = dict_ref (xattr);
+                        if (!local->inode)
+                                local->inode = inode_ref (inode);
+
+                        dht_stat_merge (this, &local->stbuf, stbuf,
+                                        prev->this);
+
+                        /* remember the inode number seen on the hashed
+                           subvolume */
+                        if (local->hashed_subvol == prev->this)
+                                local->st_ino = local->stbuf.st_ino;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = dht_frame_return (frame);
+        if (!is_last_call (remaining))
+                return 0;
+
+        if (local->op_ret == 0) {
+                ret = dht_layout_normalize (this, &local->loc, layout);
+
+                local->layout = NULL;
+
+                if (ret != 0) {
+                        layout->gen = conf->gen;
+
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "fixing assignment on %s",
+                                local->loc.path);
+
+                        /* the self-heal callback unwinds the frame */
+                        ret = dht_selfheal_directory (frame,
+                                                      dht_lookup_selfheal_cbk,
+                                                      &local->loc, layout);
+                        return 0;
+                }
+
+                inode_ctx_put (local->inode, this, (uint64_t)(long)layout);
+
+                if (!local->st_ino) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "could not find hashed subvolume for %s",
+                                local->loc.path);
+                } else {
+                        local->stbuf.st_ino = local->st_ino;
+                }
+        }
+
+        DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                          local->inode, &local->stbuf, local->xattr);
+
+        return 0;
+}
+
+/*
+ * Per-subvolume reply handler for a revalidating lookup wound from
+ * dht_lookup().
+ *
+ * A failed reply only records its errno.  A reply whose file type differs
+ * from the cached inode's fails the call with EINVAL.  A linkfile, or a
+ * directory whose layout no longer matches, marks the call as a layout
+ * mismatch, which is reported as ESTALE once every reply is in.  Any other
+ * reply has its stat merged into local->stbuf.
+ */
+int
+dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int op_ret, int op_errno,
+                    inode_t *inode, struct stat *stbuf, dict_t *xattr)
+{
+        dht_local_t  *local       = frame->local;
+        call_frame_t *prev        = cookie;
+        dht_layout_t *layout      = NULL;
+        int           remaining   = 0;
+        int           mismatch    = -1;
+        int           is_dir      = 0;
+        int           is_linkfile = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == -1) {
+                        local->op_errno = op_errno;
+
+                        /* ENOTCONN and ENOENT are not logged */
+                        if (!(op_errno == ENOTCONN || op_errno == ENOENT)) {
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "subvolume %s returned -1 (%s)",
+                                        prev->this->name,
+                                        strerror (op_errno));
+                        }
+
+                        goto unlock;
+                }
+
+                if ((stbuf->st_mode & S_IFMT)
+                    != (local->inode->st_mode & S_IFMT)) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "mismatching filetypes 0%o v/s 0%o for %s",
+                                (stbuf->st_mode & S_IFMT),
+                                (local->inode->st_mode & S_IFMT),
+                                local->loc.path);
+
+                        local->op_errno = EINVAL;
+                        local->op_ret   = -1;
+
+                        goto unlock;
+                }
+
+                layout = dht_layout_get (this, inode);
+
+                is_dir      = check_is_dir (inode, stbuf, xattr);
+                is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+
+                if (is_linkfile) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "linkfile found in revalidate for %s",
+                                local->loc.path);
+                        local->layout_mismatch = 1;
+
+                        goto unlock;
+                }
+
+                if (is_dir) {
+                        mismatch = dht_layout_dir_mismatch (this, layout,
+                                                            prev->this,
+                                                            &local->loc,
+                                                            xattr);
+                        if (mismatch != 0) {
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "mismatching layouts for %s",
+                                        local->loc.path);
+
+                                local->layout_mismatch = 1;
+
+                                goto unlock;
+                        }
+                }
+
+                dht_stat_merge (this, &local->stbuf, stbuf, prev->this);
+
+                local->op_ret = 0;
+                local->stbuf.st_ino = local->st_ino;
+
+                if (local->xattr == NULL)
+                        local->xattr = dict_ref (xattr);
+        }
+unlock:
+        UNLOCK (&frame->lock);
+
+        remaining = dht_frame_return (frame);
+        if (!is_last_call (remaining))
+                return 0;
+
+        /* a non-directory whose hashed and cached subvolumes differ and
+           that has a single link gets the sticky bit in the returned mode */
+        if (!S_ISDIR (local->stbuf.st_mode)
+            && (local->hashed_subvol != local->cached_subvol)
+            && (local->stbuf.st_nlink == 1)) {
+                local->stbuf.st_mode |= S_ISVTX;
+        }
+
+        if (local->layout_mismatch) {
+                local->op_ret   = -1;
+                local->op_errno = ESTALE;
+        }
+
+        DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                          local->inode, &local->stbuf, local->xattr);
+
+        return 0;
+}
+
+
+/*
+ * Callback passed to dht_linkfile_create() by dht_lookup_everywhere_cbk().
+ *
+ * Stores the pre-set layout of the cached subvolume in the inode context
+ * and unwinds the lookup with the results gathered in `local`.  If no such
+ * layout exists the lookup fails with EINVAL.  The op_ret/op_errno
+ * arguments of this callback are not consulted.
+ */
+int
+dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,
+                                xlator_t *this,
+                                int32_t op_ret, int32_t op_errno,
+                                inode_t *inode, struct stat *stbuf)
+{
+        dht_local_t  *local  = frame->local;
+        xlator_t     *cached = local->cached_subvol;
+        dht_layout_t *layout = NULL;
+
+        layout = dht_layout_for_subvol (this, local->cached_subvol);
+        if (layout != NULL) {
+                inode_ctx_put (local->inode, this, (uint64_t)(long)layout);
+                local->op_ret = 0;
+
+                /* a single-link file gets the sticky bit in the returned
+                   mode */
+                if (local->stbuf.st_nlink == 1)
+                        local->stbuf.st_mode |= S_ISVTX;
+        } else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no pre-set layout for subvolume %s",
+                        cached ? cached->name : "<nil>");
+                local->op_ret   = -1;
+                local->op_errno = EINVAL;
+        }
+
+        DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                          local->inode, &local->stbuf, local->xattr);
+        return 0;
+}
+
+
+/*
+ * Per-subvolume reply handler for dht_lookup_everywhere().
+ *
+ * The first subvolume that returns something other than a linkfile becomes
+ * local->cached_subvol; later ones are only logged.  A linkfile found
+ * during the search is treated as stale and unlinked.  After the last
+ * reply the lookup either fails with ENOENT (nothing found) or a linkfile
+ * is created on the hashed subvolume, with
+ * dht_lookup_linkfile_create_cbk() completing the call.
+ */
+int
+dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno,
+                           inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+        dht_local_t  *local       = frame->local;
+        loc_t        *loc         = &local->loc;
+        call_frame_t *prev        = cookie;
+        xlator_t     *subvol      = prev->this;
+        xlator_t     *link_subvol = NULL;
+        xlator_t     *hashed      = NULL;
+        xlator_t     *cached      = NULL;
+        int           remaining   = 0;
+        int           is_linkfile = 0;
+        int           is_dir      = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == -1) {
+                        /* ENOENT is expected from most subvolumes */
+                        if (op_errno != ENOENT)
+                                local->op_errno = op_errno;
+                        goto unlock;
+                }
+
+                is_linkfile = check_is_linkfile (inode, buf, xattr);
+                /* result is not used below; the call is kept as-is */
+                is_dir = check_is_dir (inode, buf, xattr);
+
+                if (is_linkfile) {
+                        link_subvol = dht_linkfile_subvol (this, inode, buf,
+                                                           xattr);
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "found on %s linkfile %s (-> %s)",
+                                subvol->name, loc->path,
+                                link_subvol ? link_subvol->name : "''");
+                        goto unlock;
+                }
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "found on %s file %s",
+                        subvol->name, loc->path);
+
+                if (local->cached_subvol) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "multiple subvolumes (%s and %s atleast) have "
+                                "file %s", local->cached_subvol->name,
+                                subvol->name, local->loc.path);
+                } else {
+                        /* found one file */
+                        dht_stat_merge (this, &local->stbuf, buf, subvol);
+                        local->xattr = dict_ref (xattr);
+                        local->cached_subvol = subvol;
+                }
+        }
+unlock:
+        UNLOCK (&frame->lock);
+
+        if (is_linkfile) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "deleting stale linkfile %s on %s",
+                        loc->path, subvol->name);
+                dht_linkfile_unlink (frame, this, subvol, loc);
+        }
+
+        remaining = dht_frame_return (frame);
+        if (!is_last_call (remaining))
+                return 0;
+
+        hashed = local->hashed_subvol;
+        cached = local->cached_subvol;
+
+        if (!cached) {
+                DHT_STACK_UNWIND (frame, -1, ENOENT, NULL, NULL, NULL);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_WARNING,
+                "linking file %s existing on %s to %s (hash)",
+                loc->path, cached->name, hashed->name);
+
+        dht_linkfile_create (frame, dht_lookup_linkfile_create_cbk,
+                             cached, hashed, loc);
+
+        return 0;
+}
+
+
+/*
+ * Wind a lookup for `loc` to every configured subvolume; the replies are
+ * collected by dht_lookup_everywhere_cbk().  A reference to loc->inode is
+ * taken into local->inode if none is held yet.
+ */
+int
+dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        dht_conf_t  *conf   = this->private;
+        dht_local_t *local  = frame->local;
+        int          fanout = conf->subvolume_cnt;
+        int          idx    = 0;
+
+        local->call_cnt = fanout;
+
+        if (local->inode == NULL)
+                local->inode = inode_ref (loc->inode);
+
+        while (idx < fanout) {
+                STACK_WIND (frame, dht_lookup_everywhere_cbk,
+                            conf->subvolumes[idx],
+                            conf->subvolumes[idx]->fops->lookup,
+                            loc, local->xattr_req);
+                idx++;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Reply handler for the lookup that dht_lookup_cbk() winds to the
+ * subvolume a linkfile points at.
+ *
+ * If that lookup fails, the file is searched for on every subvolume via
+ * dht_lookup_everywhere().  Otherwise the stat is adjusted (sticky bit for
+ * single-link files, transformed inode number), the subvolume's pre-set
+ * layout is stored in the inode context and the frame is unwound.  A
+ * missing pre-set layout fails the lookup with EINVAL.
+ */
+int
+dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie,
+                         xlator_t *this, int op_ret, int op_errno,
+                         inode_t *inode, struct stat *stbuf, dict_t *xattr)
+{
+        call_frame_t *prev   = cookie;
+        xlator_t     *subvol = prev->this;
+        dht_local_t  *local  = frame->local;
+        loc_t        *loc    = &local->loc;
+        dht_layout_t *layout = NULL;
+
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "lookup of %s on %s (following linkfile) failed (%s)",
+                        local->loc.path, subvol->name, strerror (op_errno));
+
+                dht_lookup_everywhere (frame, this, loc);
+                return 0;
+        }
+
+        /* TODO: assert type is non-dir and non-linkfile */
+
+        if (stbuf->st_nlink == 1)
+                stbuf->st_mode |= S_ISVTX;
+        dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino);
+
+        layout = dht_layout_for_subvol (this, prev->this);
+        if (layout) {
+                inode_ctx_put (inode, this, (uint64_t)(long)layout);
+        } else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no pre-set layout for subvolume %s",
+                        prev->this->name);
+                op_ret   = -1;
+                op_errno = EINVAL;
+        }
+
+        DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr);
+
+        return 0;
+}
+
+
+/*
+ * Reply handler for the fresh lookup that dht_lookup() winds to the hashed
+ * subvolume.  Depending on the reply it:
+ *   - searches every subvolume (entry missing and conf->search_unhashed);
+ *   - winds the lookup to all subvolumes with dht_lookup_dir_cbk() (the
+ *     entry is a directory, or the reply was ENOTCONN);
+ *   - unwinds with the error (any other failure);
+ *   - follows a linkfile to the subvolume it names, or searches everywhere
+ *     when the linkfile names none;
+ *   - otherwise stores the subvolume's pre-set layout in the inode context
+ *     and unwinds with the transformed inode number.
+ */
+int
+dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int op_ret, int op_errno,
+                inode_t *inode, struct stat *stbuf, dict_t *xattr)
+{
+        dht_conf_t   *conf        = this->private;
+        call_frame_t *prev        = cookie;
+        dht_local_t  *local       = frame->local;
+        loc_t        *loc         = &local->loc;
+        dht_layout_t *layout      = NULL;
+        xlator_t     *link_target = NULL;
+        char          is_linkfile = 0;
+        char          is_dir      = 0;
+        int           idx         = 0;
+        int           fanout      = 0;
+
+        if (ENTRY_MISSING (op_ret, op_errno)) {
+                if (conf->search_unhashed) {
+                        local->op_errno = ENOENT;
+                        dht_lookup_everywhere (frame, this, loc);
+                        return 0;
+                }
+        }
+
+        if (op_ret == 0) {
+                is_dir = check_is_dir (inode, stbuf, xattr);
+                if (is_dir) {
+                        local->inode = inode_ref (inode);
+                        local->xattr = dict_ref (xattr);
+                }
+        }
+
+        if (is_dir || (op_ret == -1 && op_errno == ENOTCONN)) {
+                fanout = conf->subvolume_cnt;
+                local->call_cnt = fanout;
+
+                local->layout = dht_layout_new (this, conf->subvolume_cnt);
+                if (local->layout == NULL) {
+                        op_ret   = -1;
+                        op_errno = ENOMEM;
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "memory allocation failed :(");
+                        goto out;
+                }
+
+                for (idx = 0; idx < fanout; idx++) {
+                        STACK_WIND (frame, dht_lookup_dir_cbk,
+                                    conf->subvolumes[idx],
+                                    conf->subvolumes[idx]->fops->lookup,
+                                    &local->loc, local->xattr_req);
+                }
+                return 0;
+        }
+
+        if (op_ret == -1)
+                goto out;
+
+        is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+        is_dir      = check_is_dir (inode, stbuf, xattr);
+
+        if (is_linkfile) {
+                link_target = dht_linkfile_subvol (this, inode, stbuf, xattr);
+
+                if (link_target == NULL) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "linkfile not having link subvolume. path=%s",
+                                loc->path);
+                        dht_lookup_everywhere (frame, this, loc);
+                        return 0;
+                }
+
+                STACK_WIND (frame, dht_lookup_linkfile_cbk,
+                            link_target, link_target->fops->lookup,
+                            &local->loc, local->xattr_req);
+                return 0;
+        }
+
+        if (is_dir)
+                return 0;
+
+        /* non-directory and not a linkfile */
+
+        dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino);
+
+        layout = dht_layout_for_subvol (this, prev->this);
+        if (layout == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no pre-set layout for subvolume %s",
+                        prev->this->name);
+                op_ret   = -1;
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        inode_ctx_put (inode, this, (uint64_t)(long)layout);
+
+out:
+        DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr);
+        return 0;
+}
+
+
+/*
+ * lookup fop of the distribute translator.
+ *
+ * Revalidate (is_revalidate (loc)): the cached layout is required; the
+ * lookup is wound to every subvolume in that layout and the replies are
+ * handled by dht_revalidate_cbk().
+ *
+ * Fresh lookup: wound to the hashed subvolume (dht_lookup_cbk()), or, when
+ * no hashed subvolume can be determined, to every subvolume
+ * (dht_lookup_dir_cbk()).
+ *
+ * In both cases the request dictionary asks for the "trusted.glusterfs.dht"
+ * xattr; fresh lookups also ask for "trusted.glusterfs.dht.linkto".  When
+ * the caller supplies no xattr_req a new dictionary is created; failure to
+ * create it fails the call with ENOMEM.
+ *
+ * Errors detected here are unwound with op_ret -1.  Always returns 0.
+ */
+int
+dht_lookup (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, dict_t *xattr_req)
+{
+        xlator_t     *subvol = NULL;
+        xlator_t     *hashed_subvol = NULL;
+        xlator_t     *cached_subvol = NULL;
+        dht_local_t  *local  = NULL;
+        dht_conf_t   *conf = NULL;
+        int           ret    = -1;
+        int           op_errno = -1;
+        dht_layout_t *layout = NULL;
+        int           i = 0;
+        int           call_cnt = 0;
+
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        conf = this->private;
+
+        local = dht_local_init (frame);
+        if (!local) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        ret = loc_dup (loc, &local->loc);
+        if (ret == -1) {
+                op_errno = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "copying location failed for path=%s",
+                        loc->path);
+                goto err;
+        }
+
+        if (xattr_req) {
+                local->xattr_req = dict_ref (xattr_req);
+        } else {
+                local->xattr_req = dict_new ();
+        }
+
+        /* every path below stores keys in, and winds with, this dictionary */
+        if (!local->xattr_req) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        hashed_subvol = dht_subvol_get_hashed (this, loc);
+        cached_subvol = dht_subvol_get_cached (this, loc->inode);
+
+        local->cached_subvol = cached_subvol;
+        local->hashed_subvol = hashed_subvol;
+
+        if (is_revalidate (loc)) {
+                layout = dht_layout_get (this, loc->inode);
+
+                if (!layout) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "revalidate without cache. path=%s",
+                                loc->path);
+                        op_errno = EINVAL;
+                        goto err;
+                }
+
+                /* a layout stamped with an older generation than the
+                   configuration is rejected with EAGAIN */
+                if (layout->gen && (layout->gen < conf->gen)) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "incomplete layout failure for path=%s",
+                                loc->path);
+                        op_errno = EAGAIN;
+                        goto err;
+                }
+
+                local->inode    = inode_ref (loc->inode);
+                local->st_ino   = loc->inode->ino;
+
+                local->call_cnt = layout->cnt;
+                call_cnt = local->call_cnt;
+
+                /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute,
+                 *       revalidates directly go to the cached-subvolume.
+                 */
+                ret = dict_set_uint32 (local->xattr_req,
+                                       "trusted.glusterfs.dht", 4 * 4);
+
+                for (i = 0; i < layout->cnt; i++) {
+                        subvol = layout->list[i].xlator;
+
+                        STACK_WIND (frame, dht_revalidate_cbk,
+                                    subvol, subvol->fops->lookup,
+                                    loc, local->xattr_req);
+
+                        /* stop after the last expected wind without
+                           re-reading layout->cnt */
+                        if (!--call_cnt)
+                                break;
+                }
+        } else {
+                /* TODO: remove the hard-coding */
+                ret = dict_set_uint32 (local->xattr_req,
+                                       "trusted.glusterfs.dht", 4 * 4);
+
+                ret = dict_set_uint32 (local->xattr_req,
+                                       "trusted.glusterfs.dht.linkto", 256);
+
+                if (!hashed_subvol) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "no subvolume in layout for path=%s, "
+                                "checking on all the subvols to see if "
+                                "it is a directory", loc->path);
+                        call_cnt = conf->subvolume_cnt;
+                        local->call_cnt = call_cnt;
+
+                        local->layout = dht_layout_new (this, conf->subvolume_cnt);
+                        if (!local->layout) {
+                                op_errno = ENOMEM;
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "memory allocation failed :(");
+                                goto err;
+                        }
+
+                        for (i = 0; i < call_cnt; i++) {
+                                STACK_WIND (frame, dht_lookup_dir_cbk,
+                                            conf->subvolumes[i],
+                                            conf->subvolumes[i]->fops->lookup,
+                                            &local->loc, local->xattr_req);
+                        }
+                        return 0;
+                }
+
+                STACK_WIND (frame, dht_lookup_cbk,
+                            hashed_subvol, hashed_subvol->fops->lookup,
+                            loc, local->xattr_req);
+        }
+
+        return 0;
+
+err:
+        /* fall back to errno when no specific error code was chosen */
+        op_errno = (op_errno == -1) ? errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+
+
+/*
+ * Reply handler used by dht_stat(), dht_fstat(), dht_chmod() and
+ * dht_chown().
+ *
+ * A successful reply has its stat merged into local->stbuf and marks the
+ * call successful; the inode number is taken from local->inode when one is
+ * held.  A failed reply records its errno.  The frame is unwound after the
+ * last reply.
+ */
+int
+dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+              int op_ret, int op_errno, struct stat *stbuf)
+{
+        dht_local_t  *local     = frame->local;
+        call_frame_t *prev      = cookie;
+        int           remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        dht_stat_merge (this, &local->stbuf, stbuf,
+                                        prev->this);
+
+                        if (local->inode)
+                                local->stbuf.st_ino = local->inode->ino;
+                        local->op_ret = 0;
+                } else {
+                        local->op_errno = op_errno;
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "subvolume %s returned -1 (%s)",
+                                prev->this->name, strerror (op_errno));
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = dht_frame_return (frame);
+        if (is_last_call (remaining)) {
+                DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                                  &local->stbuf);
+        }
+
+        return 0;
+}
+
+
+/*
+ * stat fop: wind stat to every subvolume in the inode's layout and let
+ * dht_attr_cbk() merge the replies.  Fails with EINVAL when the inode has
+ * no layout and with ENOMEM when the local state cannot be allocated.
+ */
+int
+dht_stat (call_frame_t *frame, xlator_t *this,
+          loc_t *loc)
+{
+        dht_layout_t *layout   = NULL;
+        dht_local_t  *local    = NULL;
+        xlator_t     *target   = NULL;
+        int           op_errno = -1;
+        int           idx      = 0;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        layout = dht_layout_get (this, loc->inode);
+        if (layout == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no layout for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        local = dht_local_init (frame);
+        if (local == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        local->inode    = inode_ref (loc->inode);
+        local->call_cnt = layout->cnt;
+
+        idx = 0;
+        while (idx < layout->cnt) {
+                target = layout->list[idx].xlator;
+
+                STACK_WIND (frame, dht_attr_cbk,
+                            target, target->fops->stat,
+                            loc);
+                idx++;
+        }
+
+        return 0;
+
+err:
+        /* fall back to errno when no specific error code was chosen */
+        if (op_errno == -1)
+                op_errno = errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+
+/*
+ * fstat fop: wind fstat to every subvolume in the layout of fd->inode and
+ * let dht_attr_cbk() merge the replies.  Fails with EINVAL when the inode
+ * has no layout and with ENOMEM when the local state cannot be allocated.
+ */
+int
+dht_fstat (call_frame_t *frame, xlator_t *this,
+           fd_t *fd)
+{
+        dht_layout_t *layout   = NULL;
+        dht_local_t  *local    = NULL;
+        xlator_t     *target   = NULL;
+        int           op_errno = -1;
+        int           idx      = 0;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        layout = dht_layout_get (this, fd->inode);
+        if (layout == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no layout for fd=%p", fd);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        local = dht_local_init (frame);
+        if (local == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "local allocation failed :(");
+                goto err;
+        }
+
+        local->inode    = inode_ref (fd->inode);
+        local->call_cnt = layout->cnt;
+
+        idx = 0;
+        while (idx < layout->cnt) {
+                target = layout->list[idx].xlator;
+
+                STACK_WIND (frame, dht_attr_cbk,
+                            target, target->fops->fstat,
+                            fd);
+                idx++;
+        }
+
+        return 0;
+
+err:
+        /* fall back to errno when no specific error code was chosen */
+        if (op_errno == -1)
+                op_errno = errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+
+/*
+ * chmod fop: wind chmod to every subvolume in the inode's layout and let
+ * dht_attr_cbk() merge the replies.  Fails with EINVAL when the inode has
+ * no layout or layout_is_sane() rejects it, and with ENOMEM when the local
+ * state cannot be allocated.
+ */
+int
+dht_chmod (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, mode_t mode)
+{
+        dht_layout_t *layout   = NULL;
+        dht_local_t  *local    = NULL;
+        int           op_errno = -1;
+        int           idx      = -1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        layout = dht_layout_get (this, loc->inode);
+        if (layout == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no layout for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        if (!layout_is_sane (layout)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "layout is not sane for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        local = dht_local_init (frame);
+        if (local == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        local->inode    = inode_ref (loc->inode);
+        local->call_cnt = layout->cnt;
+
+        idx = 0;
+        while (idx < layout->cnt) {
+                STACK_WIND (frame, dht_attr_cbk,
+                            layout->list[idx].xlator,
+                            layout->list[idx].xlator->fops->chmod,
+                            loc, mode);
+                idx++;
+        }
+
+        return 0;
+
+err:
+        /* fall back to errno when no specific error code was chosen */
+        if (op_errno == -1)
+                op_errno = errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+
+/*
+ * chown fop.  Needs a layout for loc->inode that passes
+ * layout_is_sane(); chown is then wound, with the caller's loc, uid
+ * and gid, to the xlator of every layout entry.  dht_attr_cbk handles
+ * the replies and local->call_cnt holds the number of winds.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL for a missing or insane
+ * layout, ENOMEM for a failed local allocation, errno for a failed
+ * argument validation).  Always returns 0.
+ */
+int
+dht_chown (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, uid_t uid, gid_t gid)
+{
+        dht_local_t  *ctx      = NULL;
+        dht_layout_t *lay      = NULL;
+        xlator_t     *target   = NULL;
+        int           idx      = 0;
+        int           op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (loc, fail);
+        VALIDATE_OR_GOTO (loc->inode, fail);
+        VALIDATE_OR_GOTO (loc->path, fail);
+
+        lay = dht_layout_get (this, loc->inode);
+        if (lay == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no layout for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        if (!layout_is_sane (lay)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "layout is not sane for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        ctx = dht_local_init (frame);
+        if (ctx == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto fail;
+        }
+
+        ctx->call_cnt = lay->cnt;
+        ctx->inode    = inode_ref (loc->inode);
+
+        /* one wind per layout entry */
+        while (idx < lay->cnt) {
+                target = lay->list[idx].xlator;
+                STACK_WIND (frame, dht_attr_cbk,
+                            target, target->fops->chown,
+                            loc, uid, gid);
+                idx++;
+        }
+
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+        return 0;
+}
+
+
+/*
+ * fchmod fop.  Needs a layout for fd->inode that passes
+ * layout_is_sane(); fchmod is then wound, with the caller's fd and
+ * mode, to the xlator of every layout entry.  dht_attr_cbk handles the
+ * replies and local->call_cnt holds the number of winds.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL for a missing or insane
+ * layout, ENOMEM for a failed local allocation, errno for a failed
+ * argument validation).  Always returns 0.
+ */
+int
+dht_fchmod (call_frame_t *frame, xlator_t *this,
+            fd_t *fd, mode_t mode)
+{
+        dht_local_t  *ctx      = NULL;
+        dht_layout_t *lay      = NULL;
+        xlator_t     *target   = NULL;
+        int           idx      = 0;
+        int           op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (fd, fail);
+
+        lay = dht_layout_get (this, fd->inode);
+        if (lay == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no layout for fd=%p", fd);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        if (!layout_is_sane (lay)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "layout is not sane for fd=%p", fd);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        ctx = dht_local_init (frame);
+        if (ctx == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto fail;
+        }
+
+        ctx->call_cnt = lay->cnt;
+        ctx->inode    = inode_ref (fd->inode);
+
+        /* one wind per layout entry */
+        while (idx < lay->cnt) {
+                target = lay->list[idx].xlator;
+                STACK_WIND (frame, dht_attr_cbk,
+                            target, target->fops->fchmod,
+                            fd, mode);
+                idx++;
+        }
+
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+        return 0;
+}
+
+
+/*
+ * fchown fop.  Needs a layout for fd->inode that passes
+ * layout_is_sane(); fchown is then wound, with the caller's fd, uid
+ * and gid, to the xlator of every layout entry.  dht_attr_cbk handles
+ * the replies and local->call_cnt holds the number of winds.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL for a missing or insane
+ * layout, ENOMEM for a failed local allocation, errno for a failed
+ * argument validation).  Always returns 0.
+ */
+int
+dht_fchown (call_frame_t *frame, xlator_t *this,
+            fd_t *fd, uid_t uid, gid_t gid)
+{
+        dht_local_t  *ctx      = NULL;
+        dht_layout_t *lay      = NULL;
+        xlator_t     *target   = NULL;
+        int           idx      = 0;
+        int           op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (fd, fail);
+
+        lay = dht_layout_get (this, fd->inode);
+        if (lay == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no layout for fd=%p", fd);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        if (!layout_is_sane (lay)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "layout is not sane for fd=%p", fd);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        ctx = dht_local_init (frame);
+        if (ctx == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto fail;
+        }
+
+        ctx->call_cnt = lay->cnt;
+        ctx->inode    = inode_ref (fd->inode);
+
+        /* one wind per layout entry */
+        while (idx < lay->cnt) {
+                target = lay->list[idx].xlator;
+                STACK_WIND (frame, dht_attr_cbk,
+                            target, target->fops->fchown,
+                            fd, uid, gid);
+                idx++;
+        }
+
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+        return 0;
+}
+
+
+/*
+ * utimens fop.  Needs a layout for loc->inode that passes
+ * layout_is_sane(); utimens is then wound, with the caller's loc and
+ * tv, to the xlator of every layout entry.  dht_attr_cbk handles the
+ * replies and local->call_cnt holds the number of winds.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL for a missing or insane
+ * layout, ENOMEM for a failed local allocation, errno for a failed
+ * argument validation).  Always returns 0.
+ */
+int
+dht_utimens (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, struct timespec tv[2])
+{
+        dht_local_t  *ctx      = NULL;
+        dht_layout_t *lay      = NULL;
+        xlator_t     *target   = NULL;
+        int           idx      = 0;
+        int           op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (loc, fail);
+        VALIDATE_OR_GOTO (loc->inode, fail);
+        VALIDATE_OR_GOTO (loc->path, fail);
+
+        lay = dht_layout_get (this, loc->inode);
+        if (lay == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no layout for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        if (!layout_is_sane (lay)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "layout is not sane for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        ctx = dht_local_init (frame);
+        if (ctx == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto fail;
+        }
+
+        ctx->call_cnt = lay->cnt;
+        ctx->inode    = inode_ref (loc->inode);
+
+        /* one wind per layout entry */
+        while (idx < lay->cnt) {
+                target = lay->list[idx].xlator;
+                STACK_WIND (frame, dht_attr_cbk,
+                            target, target->fops->utimens,
+                            loc, tv);
+                idx++;
+        }
+
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+        return 0;
+}
+
+
+/*
+ * truncate fop.  Winds truncate (loc, offset) to the single subvolume
+ * returned by dht_subvol_get_cached() for loc->inode; the reply goes
+ * to dht_attr_cbk with local->call_cnt set to 1.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL when there is no cached
+ * subvolume, ENOMEM when the local cannot be allocated, errno when
+ * argument validation fails).  Always returns 0.
+ */
+int
+dht_truncate (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, off_t offset)
+{
+        dht_local_t *ctx      = NULL;
+        xlator_t    *target   = NULL;
+        int          op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (loc, fail);
+        VALIDATE_OR_GOTO (loc->inode, fail);
+        VALIDATE_OR_GOTO (loc->path, fail);
+
+        target = dht_subvol_get_cached (this, loc->inode);
+        if (target == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        ctx = dht_local_init (frame);
+        if (ctx == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto fail;
+        }
+
+        ctx->call_cnt = 1;
+        ctx->inode    = inode_ref (loc->inode);
+
+        STACK_WIND (frame, dht_attr_cbk,
+                    target, target->fops->truncate,
+                    loc, offset);
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+        return 0;
+}
+
+
+/*
+ * ftruncate fop.  Winds ftruncate (fd, offset) to the single subvolume
+ * returned by dht_subvol_get_cached() for fd->inode; the reply goes to
+ * dht_attr_cbk with local->call_cnt set to 1.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL when there is no cached
+ * subvolume, ENOMEM when the local cannot be allocated, errno when
+ * argument validation fails).  Always returns 0.
+ */
+int
+dht_ftruncate (call_frame_t *frame, xlator_t *this,
+               fd_t *fd, off_t offset)
+{
+        dht_local_t *ctx      = NULL;
+        xlator_t    *target   = NULL;
+        int          op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (fd, fail);
+
+        target = dht_subvol_get_cached (this, fd->inode);
+        if (target == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for fd=%p", fd);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        ctx = dht_local_init (frame);
+        if (ctx == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto fail;
+        }
+
+        ctx->call_cnt = 1;
+        ctx->inode    = inode_ref (fd->inode);
+
+        STACK_WIND (frame, dht_attr_cbk,
+                    target, target->fops->ftruncate,
+                    fd, offset);
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+        return 0;
+}
+
+
+/*
+ * Shared callback for fops whose reply carries only op_ret and
+ * op_errno.  Under frame->lock it records the result in the local: a
+ * failing reply (op_ret == -1) stores op_errno and logs the name of
+ * the answering subvolume (taken from the cookie frame); any other
+ * reply sets local->op_ret to 0.
+ *
+ * When dht_frame_return() reports the last outstanding call, the frame
+ * is unwound with the accumulated local->op_ret / local->op_errno.
+ */
+int
+dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+             int op_ret, int op_errno)
+{
+        dht_local_t  *ctx       = frame->local;
+        call_frame_t *child     = cookie;
+        int           remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        ctx->op_ret = 0;
+                } else {
+                        ctx->op_errno = op_errno;
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "subvolume %s returned -1 (%s)",
+                                child->this->name, strerror (op_errno));
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = dht_frame_return (frame);
+        if (is_last_call (remaining)) {
+                DHT_STACK_UNWIND (frame, ctx->op_ret, ctx->op_errno);
+        }
+
+        return 0;
+}
+
+
+/*
+ * access fop.  Winds access (loc, mask) to the single subvolume
+ * returned by dht_subvol_get_cached() for loc->inode; the reply goes
+ * to dht_err_cbk with local->call_cnt set to 1.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL when there is no cached
+ * subvolume, ENOMEM when the local cannot be allocated, errno when
+ * argument validation fails).  Always returns 0.
+ */
+int
+dht_access (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, int32_t mask)
+{
+        dht_local_t *ctx      = NULL;
+        xlator_t    *target   = NULL;
+        int          op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (loc, fail);
+        VALIDATE_OR_GOTO (loc->inode, fail);
+        VALIDATE_OR_GOTO (loc->path, fail);
+
+        target = dht_subvol_get_cached (this, loc->inode);
+        if (target == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        ctx = dht_local_init (frame);
+        if (ctx == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto fail;
+        }
+
+        ctx->call_cnt = 1;
+
+        STACK_WIND (frame, dht_err_cbk,
+                    target, target->fops->access,
+                    loc, mask);
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+        return 0;
+}
+
+
+/*
+ * Callback used by dht_readlink: forwards op_ret, op_errno and path to
+ * the caller unchanged.  Always returns 0.
+ */
+int
+dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, const char *path)
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, path);
+
+ return 0;
+}
+
+
+/*
+ * readlink fop.  Winds readlink (loc, size) to the single subvolume
+ * returned by dht_subvol_get_cached() for loc->inode; the reply is
+ * passed through by dht_readlink_cbk.  No local is allocated.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL when there is no cached
+ * subvolume, errno when argument validation fails).
+ * Always returns 0.
+ */
+int
+dht_readlink (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, size_t size)
+{
+        xlator_t *target   = NULL;
+        int       op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (loc, fail);
+        VALIDATE_OR_GOTO (loc->inode, fail);
+        VALIDATE_OR_GOTO (loc->path, fail);
+
+        target = dht_subvol_get_cached (this, loc->inode);
+        if (target == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        STACK_WIND (frame, dht_readlink_cbk,
+                    target, target->fops->readlink,
+                    loc, size);
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+        return 0;
+}
+
+
+/*
+ * Callback used by dht_getxattr: forwards op_ret, op_errno and the
+ * xattr dictionary to the caller unchanged.  Always returns 0.
+ */
+int
+dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr)
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, xattr);
+
+ return 0;
+}
+
+
+/*
+ * getxattr fop.  Winds getxattr (loc, key) to the single subvolume
+ * returned by dht_subvol_get_cached() for loc->inode; the reply is
+ * passed through by dht_getxattr_cbk.  No local is allocated.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL when there is no cached
+ * subvolume, errno when argument validation fails).
+ * Always returns 0.
+ */
+int
+dht_getxattr (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, const char *key)
+{
+        xlator_t *target   = NULL;
+        int       op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (loc, fail);
+        VALIDATE_OR_GOTO (loc->inode, fail);
+        VALIDATE_OR_GOTO (loc->path, fail);
+
+        target = dht_subvol_get_cached (this, loc->inode);
+        if (target == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        STACK_WIND (frame, dht_getxattr_cbk,
+                    target, target->fops->getxattr,
+                    loc, key);
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+        return 0;
+}
+
+
+/*
+ * setxattr fop.  Winds setxattr (loc, xattr, flags) to the single
+ * subvolume returned by dht_subvol_get_cached() for loc->inode; the
+ * reply goes to dht_err_cbk with local->call_cnt set to 1.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL when there is no cached
+ * subvolume, ENOMEM when the local cannot be allocated, errno when
+ * argument validation fails).  The error unwind carries only op_ret
+ * and op_errno, the same argument list dht_err_cbk unwinds with on the
+ * normal path.  Always returns 0.
+ */
+int
+dht_setxattr (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, dict_t *xattr, int flags)
+{
+        xlator_t    *subvol = NULL;
+        int          op_errno = -1;
+        dht_local_t *local = NULL;
+
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = dht_subvol_get_cached (this, loc->inode);
+        if (!subvol) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        local = dht_local_init (frame);
+        if (!local) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        local->call_cnt = 1;
+
+        STACK_WIND (frame, dht_err_cbk,
+                    subvol, subvol->fops->setxattr,
+                    loc, xattr, flags);
+
+        return 0;
+
+err:
+        /* -1 means validation failed and the reason is in errno */
+        op_errno = (op_errno == -1) ? errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+
+        return 0;
+}
+
+
+/*
+ * removexattr fop.  Winds removexattr (loc, key) to the single
+ * subvolume returned by dht_subvol_get_cached() for loc->inode; the
+ * reply goes to dht_err_cbk with local->call_cnt set to 1.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL when there is no cached
+ * subvolume, ENOMEM when the local cannot be allocated, errno when
+ * argument validation fails).  The error unwind carries only op_ret
+ * and op_errno, the same argument list dht_err_cbk unwinds with on the
+ * normal path.  Always returns 0.
+ */
+int
+dht_removexattr (call_frame_t *frame, xlator_t *this,
+                 loc_t *loc, const char *key)
+{
+        xlator_t    *subvol = NULL;
+        int          op_errno = -1;
+        dht_local_t *local = NULL;
+
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = dht_subvol_get_cached (this, loc->inode);
+        if (!subvol) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        local = dht_local_init (frame);
+        if (!local) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        local->call_cnt = 1;
+
+        STACK_WIND (frame, dht_err_cbk,
+                    subvol, subvol->fops->removexattr,
+                    loc, key);
+
+        return 0;
+
+err:
+        /* -1 means validation failed and the reason is in errno */
+        op_errno = (op_errno == -1) ? errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+
+        return 0;
+}
+
+
+/*
+ * Shared callback for fops that reply with an fd (wound from dht_open
+ * and dht_opendir in this file).  Under frame->lock it records the
+ * result in the local: a failing reply (op_ret == -1) stores op_errno
+ * and logs the name of the answering subvolume (taken from the cookie
+ * frame); any other reply sets local->op_ret to 0.
+ *
+ * When dht_frame_return() reports the last outstanding call, the frame
+ * is unwound with local->op_ret, local->op_errno and local->fd (the fd
+ * argument of the reply itself is not used).
+ */
+int
+dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+            int op_ret, int op_errno, fd_t *fd)
+{
+        dht_local_t  *ctx       = frame->local;
+        call_frame_t *child     = cookie;
+        int           remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        ctx->op_ret = 0;
+                } else {
+                        ctx->op_errno = op_errno;
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "subvolume %s returned -1 (%s)",
+                                child->this->name, strerror (op_errno));
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = dht_frame_return (frame);
+        if (is_last_call (remaining)) {
+                DHT_STACK_UNWIND (frame, ctx->op_ret, ctx->op_errno,
+                                  ctx->fd);
+        }
+
+        return 0;
+}
+
+
+/*
+ * open fop.  Winds open (loc, flags, fd) to the single subvolume
+ * returned by dht_subvol_get_cached() for fd->inode; the reply goes to
+ * dht_fd_cbk with local->call_cnt set to 1.  The local keeps a
+ * reference on fd and a copy of loc (via loc_dup).
+ *
+ * loc is validated together with the other arguments because it is
+ * handed to loc_dup() below.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL when there is no cached
+ * subvolume, ENOMEM when the local or the loc copy cannot be
+ * allocated, errno when argument validation fails).
+ * Always returns 0.
+ */
+int
+dht_open (call_frame_t *frame, xlator_t *this,
+          loc_t *loc, int flags, fd_t *fd)
+{
+        xlator_t    *subvol = NULL;
+        int          ret = -1;
+        int          op_errno = -1;
+        dht_local_t *local = NULL;
+
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        subvol = dht_subvol_get_cached (this, fd->inode);
+        if (!subvol) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for fd=%p", fd);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        local = dht_local_init (frame);
+        if (!local) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        local->fd = fd_ref (fd);
+        ret = loc_dup (loc, &local->loc);
+        if (ret == -1) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        local->call_cnt = 1;
+
+        STACK_WIND (frame, dht_fd_cbk,
+                    subvol, subvol->fops->open,
+                    loc, flags, fd);
+
+        return 0;
+
+err:
+        /* -1 means validation failed and the reason is in errno */
+        op_errno = (op_errno == -1) ? errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+
+/*
+ * Callback used by dht_readv: forwards op_ret, op_errno, the iovec
+ * array, its count and stbuf to the caller unchanged.
+ * Always returns 0.
+ */
+int
+dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ struct iovec *vector, int count, struct stat *stbuf)
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
+
+ return 0;
+}
+
+
+/*
+ * readv fop.  Winds readv (fd, size, off) to the single subvolume
+ * returned by dht_subvol_get_cached() for fd->inode; the reply is
+ * passed through by dht_readv_cbk.  No local is allocated.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL when there is no cached
+ * subvolume, errno when argument validation fails).
+ * Always returns 0.
+ */
+int
+dht_readv (call_frame_t *frame, xlator_t *this,
+           fd_t *fd, size_t size, off_t off)
+{
+        xlator_t *target   = NULL;
+        int       op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (fd, fail);
+
+        target = dht_subvol_get_cached (this, fd->inode);
+        if (target == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for fd=%p", fd);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        STACK_WIND (frame, dht_readv_cbk,
+                    target, target->fops->readv,
+                    fd, size, off);
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL);
+        return 0;
+}
+
+
+/*
+ * Callback used by dht_writev: forwards op_ret, op_errno and stbuf to
+ * the caller unchanged.  Always returns 0.
+ */
+int
+dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct stat *stbuf)
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, stbuf);
+
+ return 0;
+}
+
+
+/*
+ * writev fop.  Winds writev (fd, vector, count, off) to the single
+ * subvolume returned by dht_subvol_get_cached() for fd->inode; the
+ * reply is passed through by dht_writev_cbk.  No local is allocated.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL when there is no cached
+ * subvolume, errno when argument validation fails).  The error unwind
+ * carries a single NULL after op_errno, standing in for the stbuf that
+ * dht_writev_cbk unwinds with on the normal path.
+ * Always returns 0.
+ */
+int
+dht_writev (call_frame_t *frame, xlator_t *this,
+            fd_t *fd, struct iovec *vector, int count, off_t off)
+{
+        xlator_t *subvol = NULL;
+        int       op_errno = -1;
+
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        subvol = dht_subvol_get_cached (this, fd->inode);
+        if (!subvol) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for fd=%p", fd);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        STACK_WIND (frame, dht_writev_cbk,
+                    subvol, subvol->fops->writev,
+                    fd, vector, count, off);
+
+        return 0;
+
+err:
+        /* -1 means validation failed and the reason is in errno */
+        op_errno = (op_errno == -1) ? errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+
+/*
+ * flush fop.  Winds flush (fd) to the single subvolume returned by
+ * dht_subvol_get_cached() for fd->inode; the reply goes to
+ * dht_err_cbk with local->call_cnt set to 1.  The local keeps a
+ * reference on fd.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL when there is no cached
+ * subvolume, ENOMEM when the local cannot be allocated, errno when
+ * argument validation fails).  Always returns 0.
+ */
+int
+dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+{
+        dht_local_t *ctx      = NULL;
+        xlator_t    *target   = NULL;
+        int          op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (fd, fail);
+
+        target = dht_subvol_get_cached (this, fd->inode);
+        if (target == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for fd=%p", fd);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        ctx = dht_local_init (frame);
+        if (ctx == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto fail;
+        }
+
+        ctx->call_cnt = 1;
+        ctx->fd       = fd_ref (fd);
+
+        STACK_WIND (frame, dht_err_cbk,
+                    target, target->fops->flush, fd);
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+        return 0;
+}
+
+
+/*
+ * fsync fop.  Winds fsync (fd, datasync) to the single subvolume
+ * returned by dht_subvol_get_cached() for fd->inode; the reply goes to
+ * dht_err_cbk with local->call_cnt set to 1.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL when there is no cached
+ * subvolume, ENOMEM when the local cannot be allocated, errno when
+ * argument validation fails).  Always returns 0.
+ */
+int
+dht_fsync (call_frame_t *frame, xlator_t *this,
+           fd_t *fd, int datasync)
+{
+        xlator_t    *subvol = NULL;
+        int          op_errno = -1;
+        dht_local_t *local = NULL;
+
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        subvol = dht_subvol_get_cached (this, fd->inode);
+        if (!subvol) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for fd=%p", fd);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        local = dht_local_init (frame);
+        if (!local) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+        local->call_cnt = 1;
+
+        STACK_WIND (frame, dht_err_cbk,
+                    subvol, subvol->fops->fsync,
+                    fd, datasync);
+
+        return 0;
+
+err:
+        /* -1 means validation failed and the reason is in errno */
+        op_errno = (op_errno == -1) ? errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+
+        return 0;
+}
+
+
+/*
+ * Callback used by dht_lk: forwards op_ret, op_errno and the flock
+ * structure to the caller unchanged.  Always returns 0.
+ */
+int
+dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct flock *flock)
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, flock);
+
+ return 0;
+}
+
+
+/*
+ * lk fop.  Winds lk (fd, cmd, flock) to the single subvolume returned
+ * by dht_subvol_get_cached() for fd->inode; the reply is passed
+ * through by dht_lk_cbk.  No local is allocated.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL when there is no cached
+ * subvolume, errno when argument validation fails).
+ * Always returns 0.
+ */
+int
+dht_lk (call_frame_t *frame, xlator_t *this,
+        fd_t *fd, int cmd, struct flock *flock)
+{
+        xlator_t *target   = NULL;
+        int       op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (fd, fail);
+
+        target = dht_subvol_get_cached (this, fd->inode);
+        if (target == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for fd=%p", fd);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        STACK_WIND (frame, dht_lk_cbk,
+                    target, target->fops->lk,
+                    fd, cmd, flock);
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+        return 0;
+}
+
+/* gf_lk no longer exists
+int
+dht_gf_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct flock *flock)
+{
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, flock);
+
+ return 0;
+}
+
+
+int
+dht_gf_lk (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int cmd, struct flock *flock)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ subvol = dht_subvol_get_cached (this, fd->inode);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_gf_lk_cbk,
+ subvol, subvol->fops->gf_lk,
+ fd, cmd, flock);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+ return 0;
+}
+*/
+
+/*
+ * Callback for dht_statfs; merges one subvolume's statvfs reply into
+ * local->statvfs under frame->lock.
+ *
+ * A failing reply (op_ret == -1) only records op_errno.  Otherwise
+ * local->op_ret becomes 0, the block and file counters (f_blocks,
+ * f_bfree, f_bavail, f_files, f_ffree, f_favail) are added to the
+ * running totals, and f_bsize, f_frsize, f_fsid, f_flag and f_namemax
+ * are overwritten with this reply's values.
+ *
+ * When dht_frame_return() reports the last outstanding call, the frame
+ * is unwound with local->op_ret, local->op_errno and &local->statvfs.
+ */
+int
+dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int op_ret, int op_errno, struct statvfs *statvfs)
+{
+        dht_local_t *ctx       = frame->local;
+        int          remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        ctx->op_ret = 0;
+
+                        /* TODO: normalize sizes */
+                        ctx->statvfs.f_bsize   = statvfs->f_bsize;
+                        ctx->statvfs.f_frsize  = statvfs->f_frsize;
+
+                        /* counters accumulate across subvolumes */
+                        ctx->statvfs.f_blocks += statvfs->f_blocks;
+                        ctx->statvfs.f_bfree  += statvfs->f_bfree;
+                        ctx->statvfs.f_bavail += statvfs->f_bavail;
+                        ctx->statvfs.f_files  += statvfs->f_files;
+                        ctx->statvfs.f_ffree  += statvfs->f_ffree;
+                        ctx->statvfs.f_favail += statvfs->f_favail;
+
+                        /* the remaining fields take the latest reply */
+                        ctx->statvfs.f_fsid    = statvfs->f_fsid;
+                        ctx->statvfs.f_flag    = statvfs->f_flag;
+                        ctx->statvfs.f_namemax = statvfs->f_namemax;
+                } else {
+                        ctx->op_errno = op_errno;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = dht_frame_return (frame);
+        if (is_last_call (remaining)) {
+                DHT_STACK_UNWIND (frame, ctx->op_ret, ctx->op_errno,
+                                  &ctx->statvfs);
+        }
+
+        return 0;
+}
+
+
+/*
+ * statfs fop.  Winds statfs (loc) to every subvolume listed in the
+ * translator's private conf; dht_statfs_cbk merges the replies and
+ * local->call_cnt is set to conf->subvolume_cnt.
+ *
+ * Early failures unwind with op_ret -1 (ENOMEM when the local cannot
+ * be allocated, errno when argument validation fails).
+ * Always returns 0.
+ */
+int
+dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        dht_local_t *local = NULL;
+        dht_conf_t  *conf = NULL;
+        int          op_errno = -1;
+        int          i = -1;
+
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        conf = this->private;
+
+        local = dht_local_init (frame);
+        if (!local) {
+                /* local is dereferenced just below, so bail out here */
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        local->call_cnt = conf->subvolume_cnt;
+
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+                STACK_WIND (frame, dht_statfs_cbk,
+                            conf->subvolumes[i],
+                            conf->subvolumes[i]->fops->statfs, loc);
+        }
+
+        return 0;
+
+err:
+        /* -1 means validation failed and the reason is in errno */
+        op_errno = (op_errno == -1) ? errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+
+/*
+ * opendir fop.  Winds opendir (loc, fd) to every subvolume listed in
+ * the translator's private conf; dht_fd_cbk collects the replies and
+ * local->call_cnt is set to conf->subvolume_cnt.  The local keeps a
+ * reference on fd and a copy of loc (via loc_dup).
+ *
+ * loc is validated together with the other arguments because it is
+ * handed to loc_dup() below.
+ *
+ * Early failures unwind with op_ret -1 (ENOMEM when the local or the
+ * loc copy cannot be allocated, errno when argument validation
+ * fails).  Always returns 0.
+ */
+int
+dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
+{
+        dht_local_t *local = NULL;
+        dht_conf_t  *conf = NULL;
+        int          ret = -1;
+        int          op_errno = -1;
+        int          i = -1;
+
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        conf = this->private;
+
+        local = dht_local_init (frame);
+        if (!local) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        local->fd = fd_ref (fd);
+        ret = loc_dup (loc, &local->loc);
+        if (ret == -1) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        local->call_cnt = conf->subvolume_cnt;
+
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+                STACK_WIND (frame, dht_fd_cbk,
+                            conf->subvolumes[i],
+                            conf->subvolumes[i]->fops->opendir,
+                            loc, fd);
+        }
+
+        return 0;
+
+err:
+        /* -1 means validation failed and the reason is in errno */
+        op_errno = (op_errno == -1) ? errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+
+/*
+ * Callback for dht_readdir (and for its own re-winds).
+ *
+ * For a successful reply every entry is looked up in the directory's
+ * layout with dht_layout_search(); an entry is kept when the search
+ * names the answering subvolume (prev->this) or names no subvolume at
+ * all.  Kept entries are copied into a private list with d_ino and
+ * d_off passed through dht_itransform(), and op_ret becomes the
+ * number of kept entries.
+ *
+ * The entries were read from prev->this, so that is the subvolume
+ * handed to dht_itransform() -- including the case where the layout
+ * search found no owner and subvol is NULL.
+ *
+ * When nothing was kept (or the reply failed) readdir is re-wound, at
+ * offset 0, to the subvolume dht_subvol_next() returns after
+ * prev->this; if there is none the frame is unwound.  A negative
+ * op_ret is reported to the caller as 0.  The private list is freed
+ * after the unwind.  Always returns 0.
+ *
+ * NOTE(review): if gf_dirent_for_name() fails the unwind carries the
+ * child's op_ret rather than the number of entries copied so far --
+ * confirm whether callers rely on op_ret matching the list.
+ */
+int
+dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int op_ret, int op_errno, gf_dirent_t *orig_entries)
+{
+        dht_local_t  *local = NULL;
+        gf_dirent_t   entries;
+        gf_dirent_t  *orig_entry = NULL;
+        gf_dirent_t  *entry = NULL;
+        call_frame_t *prev = NULL;
+        xlator_t     *subvol = NULL;
+        xlator_t     *next = NULL;
+        dht_layout_t *layout = NULL;
+        int           count = 0;
+
+
+        INIT_LIST_HEAD (&entries.list);
+        prev = cookie;
+        local = frame->local;
+
+        if (op_ret < 0)
+                goto done;
+
+        layout = dht_layout_get (this, local->fd->inode);
+
+        list_for_each_entry (orig_entry, &orig_entries->list, list) {
+                subvol = dht_layout_search (this, layout, orig_entry->d_name);
+
+                if (!subvol || subvol == prev->this) {
+                        entry = gf_dirent_for_name (orig_entry->d_name);
+                        if (!entry) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "memory allocation failed :(");
+                                goto unwind;
+                        }
+
+                        /* transform against the subvolume that answered */
+                        dht_itransform (this, prev->this, orig_entry->d_ino,
+                                        &entry->d_ino);
+                        dht_itransform (this, prev->this, orig_entry->d_off,
+                                        &entry->d_off);
+
+                        entry->d_type = orig_entry->d_type;
+                        entry->d_len  = orig_entry->d_len;
+
+                        list_add_tail (&entry->list, &entries.list);
+                        count++;
+                }
+        }
+        op_ret = count;
+
+done:
+        if (count == 0) {
+                /* nothing usable from this subvolume: try the next one */
+                next = dht_subvol_next (this, prev->this);
+                if (!next) {
+                        goto unwind;
+                }
+
+                STACK_WIND (frame, dht_readdir_cbk,
+                            next, next->fops->readdir,
+                            local->fd, local->size, 0);
+                return 0;
+        }
+
+unwind:
+        if (op_ret < 0)
+                op_ret = 0;
+
+        DHT_STACK_UNWIND (frame, op_ret, op_errno, &entries);
+
+        gf_dirent_free (&entries);
+
+        return 0;
+}
+
+
+/*
+ * readdir fop.  Splits the caller's offset yoff with
+ * dht_deitransform() into a subvolume (xvol) and an offset for that
+ * subvolume (xoff), then winds readdir (fd, size, xoff) to xvol.
+ * Replies are handled by dht_readdir_cbk.  The local keeps a reference
+ * on fd and remembers size for later re-winds.
+ *
+ * xoff is declared as uint64_t, the type dht_deitransform() writes
+ * through, instead of casting the address of an off_t; the value is
+ * converted to off_t when it is passed to the readdir fop.
+ *
+ * Early failures unwind with op_ret -1 (ENOMEM when the local cannot
+ * be allocated, errno when argument validation fails).
+ * Always returns 0.
+ */
+int
+dht_readdir (call_frame_t *frame, xlator_t *this,
+             fd_t *fd, size_t size, off_t yoff)
+{
+        dht_local_t *local = NULL;
+        int          op_errno = -1;
+        xlator_t    *xvol = NULL;
+        uint64_t     xoff = 0;
+
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        local = dht_local_init (frame);
+        if (!local) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        local->fd = fd_ref (fd);
+        local->size = size;
+
+        dht_deitransform (this, yoff, &xvol, &xoff);
+
+        /* TODO: do proper readdir */
+        STACK_WIND (frame, dht_readdir_cbk,
+                    xvol, xvol->fops->readdir,
+                    fd, size, xoff);
+
+        return 0;
+
+err:
+        /* -1 means validation failed and the reason is in errno */
+        op_errno = (op_errno == -1) ? errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+
+/*
+ * Callback for dht_fsyncdir.  Under frame->lock a reply with
+ * op_ret == 0 sets local->op_ret to 0 and a reply with op_ret == -1
+ * stores op_errno; any other op_ret leaves the local untouched.
+ *
+ * When dht_frame_return() reports the last outstanding call, the frame
+ * is unwound with local->op_ret and local->op_errno.
+ */
+int
+dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int op_ret, int op_errno)
+{
+        dht_local_t *ctx       = frame->local;
+        int          remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == 0) {
+                        ctx->op_ret = 0;
+                } else if (op_ret == -1) {
+                        ctx->op_errno = op_errno;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = dht_frame_return (frame);
+        if (is_last_call (remaining)) {
+                DHT_STACK_UNWIND (frame, ctx->op_ret, ctx->op_errno);
+        }
+
+        return 0;
+}
+
+
+/*
+ * fsyncdir fop.  Winds fsyncdir (fd, datasync) to every subvolume
+ * listed in the translator's private conf; dht_fsyncdir_cbk collects
+ * the replies and local->call_cnt is set to conf->subvolume_cnt.  The
+ * local keeps a reference on fd.
+ *
+ * Early failures unwind with op_ret -1 (ENOMEM when the local cannot
+ * be allocated, errno when argument validation fails).
+ * Always returns 0.
+ */
+int
+dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync)
+{
+        dht_conf_t  *cfg      = NULL;
+        dht_local_t *ctx      = NULL;
+        xlator_t    *target   = NULL;
+        int          idx      = 0;
+        int          op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (fd, fail);
+
+        cfg = this->private;
+
+        ctx = dht_local_init (frame);
+        if (ctx == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto fail;
+        }
+
+        ctx->call_cnt = cfg->subvolume_cnt;
+        ctx->fd       = fd_ref (fd);
+
+        /* one wind per configured subvolume */
+        while (idx < cfg->subvolume_cnt) {
+                target = cfg->subvolumes[idx];
+                STACK_WIND (frame, dht_fsyncdir_cbk,
+                            target, target->fops->fsyncdir,
+                            fd, datasync);
+                idx++;
+        }
+
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+        return 0;
+}
+
+
+/*
+ * Callback for fops that create a new entry on one subvolume (wound
+ * from dht_mknod and dht_symlink in this file).
+ *
+ * On success stbuf->st_ino is rewritten in place via dht_itransform()
+ * against the answering subvolume, then the layout returned by
+ * dht_layout_for_subvol() for that subvolume is stored in the inode
+ * context.  If no layout is found, or inode_ctx_put() fails, the reply
+ * is turned into a failure (op_ret -1, EINVAL).  A failing reply is
+ * forwarded untouched.  Always unwinds with inode and stbuf and
+ * returns 0.
+ */
+int
+dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int op_ret, int op_errno,
+                 inode_t *inode, struct stat *stbuf)
+{
+        call_frame_t *child = NULL;
+        dht_layout_t *lay   = NULL;
+
+        if (op_ret == -1)
+                goto unwind;
+
+        child = cookie;
+
+        dht_itransform (this, child->this, stbuf->st_ino, &stbuf->st_ino);
+
+        lay = dht_layout_for_subvol (this, child->this);
+        if (lay == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no pre-set layout for subvolume %s",
+                        child->this->name);
+                op_errno = EINVAL;
+                op_ret   = -1;
+                goto unwind;
+        }
+
+        if (inode_ctx_put (inode, this, (uint64_t)(long)lay) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not set inode context");
+                op_errno = EINVAL;
+                op_ret   = -1;
+        }
+
+unwind:
+        DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf);
+        return 0;
+}
+
+
+/*
+ * mknod fop.  Winds mknod (loc, mode, rdev) to the subvolume returned
+ * by dht_subvol_get_hashed() for loc and logs the choice at debug
+ * level; the reply goes to dht_newfile_cbk.  No local is allocated.
+ *
+ * Early failures unwind with op_ret -1 (ENOENT when no subvolume is
+ * found, errno when argument validation fails).  Always returns 0.
+ */
+int
+dht_mknod (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, mode_t mode, dev_t rdev)
+{
+        xlator_t *target   = NULL;
+        int       op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (loc, fail);
+
+        target = dht_subvol_get_hashed (this, loc);
+        if (target == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no subvolume in layout for path=%s",
+                        loc->path);
+                op_errno = ENOENT;
+                goto fail;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "creating %s on %s", loc->path, target->name);
+
+        STACK_WIND (frame, dht_newfile_cbk,
+                    target, target->fops->mknod,
+                    loc, mode, rdev);
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+
+/*
+ * symlink fop.  Winds symlink (linkname, loc) to the subvolume
+ * returned by dht_subvol_get_hashed() for loc and logs the choice at
+ * debug level; the reply goes to dht_newfile_cbk.  No local is
+ * allocated.
+ *
+ * Early failures unwind with op_ret -1 (ENOENT when no subvolume is
+ * found, errno when argument validation fails).  Always returns 0.
+ */
+int
+dht_symlink (call_frame_t *frame, xlator_t *this,
+             const char *linkname, loc_t *loc)
+{
+        xlator_t *target   = NULL;
+        int       op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (loc, fail);
+
+        target = dht_subvol_get_hashed (this, loc);
+        if (target == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no subvolume in layout for path=%s",
+                        loc->path);
+                op_errno = ENOENT;
+                goto fail;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "creating %s on %s", loc->path, target->name);
+
+        STACK_WIND (frame, dht_newfile_cbk,
+                    target, target->fops->symlink,
+                    linkname, loc);
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+
+/*
+ * unlink fop.  Resolves two subvolumes for loc: the cached one
+ * (dht_subvol_get_cached on loc->inode) and the hashed one
+ * (dht_subvol_get_hashed on loc).  unlink is wound to the cached
+ * subvolume and, when the two differ, to the hashed subvolume as well;
+ * local->call_cnt is 1 or 2 accordingly and dht_err_cbk collects the
+ * replies.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL when either subvolume
+ * is missing, ENOMEM when the local cannot be allocated, errno when
+ * argument validation fails).  Always returns 0.
+ */
+int
+dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        xlator_t    *cached   = NULL;
+        xlator_t    *hashed   = NULL;
+        dht_local_t *ctx      = NULL;
+        int          op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (loc, fail);
+
+        cached = dht_subvol_get_cached (this, loc->inode);
+        if (cached == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        hashed = dht_subvol_get_hashed (this, loc);
+        if (hashed == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no subvolume in layout for path=%s",
+                        loc->path);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        ctx = dht_local_init (frame);
+        if (ctx == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto fail;
+        }
+
+        /* the full count must be in place before the first wind */
+        ctx->call_cnt = (hashed != cached) ? 2 : 1;
+
+        STACK_WIND (frame, dht_err_cbk,
+                    cached, cached->fops->unlink, loc);
+
+        if (hashed != cached) {
+                STACK_WIND (frame, dht_err_cbk,
+                            hashed, hashed->fops->unlink, loc);
+        }
+
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+        return 0;
+}
+
+
+/*
+ * Callback for the link fop wound from dht_link and
+ * dht_link_linkfile_cbk.
+ *
+ * On success it checks that dht_layout_for_subvol() knows a layout for
+ * the answering subvolume; if not, the reply is turned into a failure
+ * (op_ret -1, EINVAL).  Otherwise stbuf->st_ino is overwritten with
+ * the inode number of local->loc.inode.  A failing reply is forwarded
+ * untouched.  Always unwinds with inode and stbuf and returns 0.
+ */
+int
+dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+              int op_ret, int op_errno,
+              inode_t *inode, struct stat *stbuf)
+{
+        call_frame_t *child = cookie;
+        dht_local_t  *ctx   = frame->local;
+
+        if (op_ret != -1) {
+                if (dht_layout_for_subvol (this, child->this)) {
+                        stbuf->st_ino = ctx->loc.inode->ino;
+                } else {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "no pre-set layout for subvolume %s",
+                                child->this->name);
+                        op_ret   = -1;
+                        op_errno = EINVAL;
+                }
+        }
+
+        DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf);
+
+        return 0;
+}
+
+
+/*
+ * Callback passed to dht_linkfile_create() by dht_link.
+ *
+ * When the reply is not a failure, the real link fop is wound to
+ * local->linkfile.srcvol using the two locs saved in the local
+ * (local->loc and local->loc2), with dht_link_cbk as its callback.
+ * A failing reply (op_ret == -1) is unwound to the caller as is.
+ * Always returns 0.
+ */
+int
+dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno,
+                       inode_t *inode, struct stat *stbuf)
+{
+        dht_local_t *ctx    = NULL;
+        xlator_t    *source = NULL;
+
+        if (op_ret != -1) {
+                ctx    = frame->local;
+                source = ctx->linkfile.srcvol;
+
+                STACK_WIND (frame, dht_link_cbk,
+                            source, source->fops->link,
+                            &ctx->loc, &ctx->loc2);
+                return 0;
+        }
+
+        DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf);
+
+        return 0;
+}
+
+
+/*
+ * link fop.  Resolves the cached subvolume of oldloc->inode and the
+ * hashed subvolume of newloc, and saves copies of both locs in the
+ * local (local->loc and local->loc2).
+ *
+ * When the two subvolumes are the same, link (oldloc, newloc) is wound
+ * straight to it with dht_link_cbk.  When they differ,
+ * dht_linkfile_create() is called with the cached and hashed
+ * subvolumes and newloc, and dht_link_linkfile_cbk continues the
+ * operation from its reply.
+ *
+ * Early failures unwind with op_ret -1 (EINVAL when either subvolume
+ * is missing, ENOMEM when the local or a loc copy cannot be allocated,
+ * errno when argument validation fails).  Always returns 0.
+ */
+int
+dht_link (call_frame_t *frame, xlator_t *this,
+          loc_t *oldloc, loc_t *newloc)
+{
+        xlator_t    *cached   = NULL;
+        xlator_t    *hashed   = NULL;
+        dht_local_t *ctx      = NULL;
+        int          op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, fail);
+        VALIDATE_OR_GOTO (this, fail);
+        VALIDATE_OR_GOTO (oldloc, fail);
+        VALIDATE_OR_GOTO (newloc, fail);
+
+        cached = dht_subvol_get_cached (this, oldloc->inode);
+        if (cached == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for path=%s", oldloc->path);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        hashed = dht_subvol_get_hashed (this, newloc);
+        if (hashed == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no subvolume in layout for path=%s",
+                        newloc->path);
+                op_errno = EINVAL;
+                goto fail;
+        }
+
+        ctx = dht_local_init (frame);
+        if (ctx == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto fail;
+        }
+
+        if (loc_copy (&ctx->loc, oldloc) == -1) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto fail;
+        }
+
+        if (loc_copy (&ctx->loc2, newloc) == -1) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto fail;
+        }
+
+        if (hashed == cached) {
+                STACK_WIND (frame, dht_link_cbk,
+                            cached, cached->fops->link,
+                            oldloc, newloc);
+        } else {
+                dht_linkfile_create (frame, dht_link_linkfile_cbk,
+                                     cached, hashed, newloc);
+        }
+
+        return 0;
+
+fail:
+        if (op_errno == -1)
+                op_errno = errno;
+
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+
+/*
+ * Callback for create. On success the inode number is passed through
+ * dht_itransform and the layout returned by dht_layout_for_subvol for the
+ * answering subvolume is stored in the inode context. If either the layout
+ * lookup or the context store fails, the result is turned into -1/EINVAL.
+ */
+int
+dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int op_ret, int op_errno,
+                fd_t *fd, inode_t *inode, struct stat *stbuf)
+{
+        call_frame_t *prev   = NULL;
+        dht_layout_t *layout = NULL;
+
+        if (op_ret != -1) {
+                prev = cookie;
+
+                dht_itransform (this, prev->this, stbuf->st_ino,
+                                &stbuf->st_ino);
+                layout = dht_layout_for_subvol (this, prev->this);
+
+                if (layout == NULL) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "no pre-set layout for subvolume %s",
+                                prev->this->name);
+                        op_ret   = -1;
+                        op_errno = EINVAL;
+                } else if (inode_ctx_put (inode, this,
+                                          (uint64_t)(long)layout) != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "could not set inode context");
+                        op_ret   = -1;
+                        op_errno = EINVAL;
+                }
+        }
+
+        DHT_STACK_UNWIND (frame, op_ret, op_errno, fd, inode, stbuf);
+        return 0;
+}
+
+
+/*
+ * create fop: wind the create to the subvolume that loc hashes to.
+ * Unwinds with ENOENT when no hashed subvolume is found.
+ */
+int
+dht_create (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, int32_t flags, mode_t mode, fd_t *fd)
+{
+        int       op_errno = -1;
+        xlator_t *subvol   = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+
+        subvol = dht_subvol_get_hashed (this, loc);
+        if (subvol != NULL) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "creating %s on %s", loc->path, subvol->name);
+
+                STACK_WIND (frame, dht_create_cbk,
+                            subvol, subvol->fops->create,
+                            loc, flags, mode, fd);
+
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "no subvolume in layout for path=%s",
+                loc->path);
+        op_errno = ENOENT;
+
+err:
+        if (op_errno == -1)
+                op_errno = errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);
+
+        return 0;
+}
+
+
+/*
+ * Completion callback of the directory self-heal started by the mkdir
+ * callbacks. On success the healed layout is stored in the inode context,
+ * detached from local->selfheal, and the saved inode number is put back
+ * into the stat that is unwound.
+ */
+int
+dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie,
+                        xlator_t *this,
+                        int32_t op_ret, int32_t op_errno)
+{
+        dht_local_t *local = frame->local;
+
+        if (op_ret == 0) {
+                inode_ctx_put (local->inode, this,
+                               (uint64_t)(long)local->selfheal.layout);
+                local->selfheal.layout = NULL;
+                local->stbuf.st_ino    = local->st_ino;
+        }
+
+        DHT_STACK_UNWIND (frame, op_ret, op_errno,
+                          local->inode, &local->stbuf);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the mkdir wound to every subvolume other than the hashed
+ * one (wound from dht_mkdir_hashed_cbk). Each reply is merged into
+ * local->layout under the frame lock: a failed reply only records its
+ * op_errno, a successful one also merges its stat into local->stbuf.
+ * When the last reply has arrived, the layout is handed to
+ * dht_selfheal_directory, which completes via dht_mkdir_selfheal_cbk.
+ */
+int
+dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct stat *stbuf)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ int ret = -1; /* NOTE(review): assigned below but never read */
+ call_frame_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ layout = local->layout;
+
+ LOCK (&frame->lock);
+ {
+ ret = dht_layout_merge (this, layout, prev->this,
+ op_ret, op_errno, NULL);
+
+ if (op_ret == -1) {
+ /* local->op_ret is left untouched here; only the errno is kept */
+ local->op_errno = op_errno;
+ goto unlock;
+ }
+ dht_stat_merge (this, &local->stbuf, stbuf, prev->this);
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt)) {
+ /* detach the layout from local before passing it to self-heal */
+ local->layout = NULL;
+ dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk,
+ &local->loc, layout);
+ }
+
+ return 0;
+}
+
+/*
+ * Callback for the first mkdir, which dht_mkdir winds to the hashed
+ * subvolume only. A failure here is unwound immediately. On success the
+ * reply is merged into the layout and stat, the resulting inode number is
+ * saved in local->st_ino, and mkdir is wound to all remaining subvolumes
+ * (handled by dht_mkdir_cbk). With a single subvolume there is nothing
+ * left to wind, so directory self-heal is started right away.
+ */
+int
+dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ inode_t *inode, struct stat *stbuf)
+{
+ dht_local_t *local = NULL;
+ int ret = -1; /* NOTE(review): assigned below but never read */
+ call_frame_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ xlator_t *hashed_subvol = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ layout = local->layout;
+ conf = this->private;
+ hashed_subvol = local->hashed_subvol;
+
+ ret = dht_layout_merge (this, layout, prev->this,
+ op_ret, op_errno, NULL);
+
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ goto err;
+ }
+ local->op_ret = 0;
+
+ dht_stat_merge (this, &local->stbuf, stbuf, prev->this);
+
+ local->st_ino = local->stbuf.st_ino;
+
+ /* one reply is expected from every subvolume except the hashed one */
+ local->call_cnt = conf->subvolume_cnt - 1;
+
+ if (local->call_cnt == 0) {
+ local->layout = NULL;
+ dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk,
+ &local->loc, layout);
+ }
+ /* NOTE(review): in the single-subvolume case the loop below still runs
+    after self-heal was started, but its only iteration hits `continue`
+    and reads conf and a stack copy only, not local */
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == hashed_subvol)
+ continue;
+ STACK_WIND (frame, dht_mkdir_cbk,
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->mkdir,
+ &local->loc, local->mode);
+ }
+ return 0;
+err:
+ DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
+
+/*
+ * mkdir fop. Sets up the per-call state (hashed subvolume, inode ref, loc
+ * copy, mode, an empty layout sized for all subvolumes) and winds mkdir to
+ * the hashed subvolume first; dht_mkdir_hashed_cbk fans out to the rest.
+ */
+int
+dht_mkdir (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, mode_t mode)
+{
+        dht_local_t *local         = NULL;
+        dht_conf_t  *conf          = NULL;
+        xlator_t    *hashed_subvol = NULL;
+        int          op_errno      = -1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        conf = this->private;
+
+        local = dht_local_init (frame);
+        if (local == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        hashed_subvol = dht_subvol_get_hashed (this, loc);
+        if (!hashed_subvol) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "hashed subvol not found");
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        local->hashed_subvol = hashed_subvol;
+        local->mode          = mode;
+        local->inode         = inode_ref (loc->inode);
+
+        if (loc_copy (&local->loc, loc) == -1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        local->layout = dht_layout_new (this, conf->subvolume_cnt);
+        if (local->layout == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        STACK_WIND (frame, dht_mkdir_hashed_cbk,
+                    hashed_subvol,
+                    hashed_subvol->fops->mkdir,
+                    loc, mode);
+
+        return 0;
+
+err:
+        if (op_errno == -1)
+                op_errno = errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+
+/*
+ * Completion callback of the layout restore started by dht_rmdir_cbk.
+ * The self-heal's own result is ignored: the rmdir outcome saved in local
+ * is what gets unwound.
+ */
+int
+dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno)
+{
+        dht_local_t *local = frame->local;
+
+        local->layout = NULL;
+
+        DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the rmdir wound to every subvolume by dht_rmdir_do. A
+ * failed reply marks the whole operation as failed; any errno other than
+ * ENOENT additionally requests a self-heal. After the last reply, either
+ * the directory layout stored in the inode context is restored through
+ * dht_selfheal_restore, or the accumulated result is unwound directly.
+ */
+int
+dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno)
+{
+ uint64_t tmp_layout = 0;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ call_frame_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+
+ if (op_errno != ENOENT)
+ local->need_selfheal = 1;
+
+ gf_log (this->name, GF_LOG_ERROR,
+ "rmdir on %s for %s failed (%s)",
+ prev->this->name, local->loc.path,
+ strerror (op_errno));
+ goto unlock;
+ }
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt)) {
+ if (local->need_selfheal) {
+ /* NOTE(review): the return value of inode_ctx_get is not checked, so
+    layout may be NULL when passed on below */
+ inode_ctx_get (local->loc.inode, this,
+ &tmp_layout);
+ layout = (dht_layout_t *)(long)tmp_layout;
+
+ /* TODO: neater interface needed below */
+ local->stbuf.st_mode = local->loc.inode->st_mode;
+
+ dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk,
+ &local->loc, layout);
+ } else {
+ DHT_STACK_UNWIND (frame, local->op_ret,
+ local->op_errno);
+ }
+ }
+
+ return 0;
+}
+
+
+/*
+ * Second phase of rmdir, entered once every opendir/readdir probe has
+ * returned. If a probe flagged an error in local->op_ret, that error is
+ * unwound; otherwise rmdir is wound to all subvolumes.
+ */
+int
+dht_rmdir_do (call_frame_t *frame, xlator_t *this)
+{
+        dht_conf_t  *conf  = this->private;
+        dht_local_t *local = frame->local;
+        int          i     = 0;
+
+        if (local->op_ret == -1) {
+                DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+                return 0;
+        }
+
+        local->call_cnt = conf->subvolume_cnt;
+
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+                STACK_WIND (frame, dht_rmdir_cbk,
+                            conf->subvolumes[i],
+                            conf->subvolumes[i]->fops->rmdir,
+                            &local->loc);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Callback for the readdir probe issued per subvolume before rmdir. A
+ * reply with more than two entries marks the operation as failed with
+ * ENOTEMPTY (presumably two entries are "." and ".." -- confirm). After
+ * the last reply, dht_rmdir_do decides whether to proceed.
+ */
+int
+dht_rmdir_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *entries)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = -1;
+ call_frame_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret > 2) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "readdir on %s for %s returned %d entries",
+ prev->this->name, local->loc.path, op_ret);
+ /* NOTE(review): written without taking frame->lock, unlike
+    dht_rmdir_cbk */
+ local->op_ret = -1;
+ local->op_errno = ENOTEMPTY;
+ }
+
+ this_call_cnt = dht_frame_return (frame);
+
+ if (is_last_call (this_call_cnt)) {
+ dht_rmdir_do (frame, this);
+ }
+
+ return 0;
+}
+
+
+/*
+ * Callback for the opendir wound to each subvolume by dht_rmdir. On
+ * success a readdir is wound to the same subvolume on local->fd. On
+ * failure the error is only logged (local->op_ret is not changed) and the
+ * subvolume is counted as having returned; the last return triggers
+ * dht_rmdir_do.
+ */
+int
+dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, fd_t *fd)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = -1;
+ call_frame_t *prev = NULL;
+
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "opendir on %s for %s failed (%s)",
+ prev->this->name, local->loc.path,
+ strerror (op_errno));
+ goto err;
+ }
+
+ /* presumably 4096 is the requested size and 0 the starting offset --
+    confirm against the readdir fop signature */
+ STACK_WIND (frame, dht_rmdir_readdir_cbk,
+ prev->this, prev->this->fops->readdir,
+ local->fd, 4096, 0);
+
+ return 0;
+
+err:
+ this_call_cnt = dht_frame_return (frame);
+
+ if (is_last_call (this_call_cnt)) {
+ dht_rmdir_do (frame, this);
+ }
+
+ return 0;
+}
+
+
+/*
+ * rmdir fop, first phase: open the directory on every subvolume so that
+ * the opendir/readdir callbacks can check its contents before
+ * dht_rmdir_do winds the actual rmdir.
+ */
+int
+dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        dht_conf_t  *conf     = NULL;
+        dht_local_t *local    = NULL;
+        int          op_errno = -1;
+        int          i        = 0;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        conf = this->private;
+
+        local = dht_local_init (frame);
+        if (local == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        local->op_ret   = 0;
+        local->call_cnt = conf->subvolume_cnt;
+
+        if (loc_copy (&local->loc, loc) == -1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        local->fd = fd_create (local->loc.inode, frame->root->pid);
+        if (local->fd == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+                STACK_WIND (frame, dht_rmdir_opendir_cbk,
+                            conf->subvolumes[i],
+                            conf->subvolumes[i]->fops->opendir,
+                            loc, local->fd);
+        }
+
+        return 0;
+
+err:
+        if (op_errno == -1)
+                op_errno = errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+
+        return 0;
+}
+
+
+/* xattrop callback: passes the subvolume's result straight through. */
+static int32_t
+dht_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *dict)
+{
+        DHT_STACK_UNWIND (frame, op_ret, op_errno, dict);
+
+        return 0;
+}
+
+/*
+ * xattrop fop: wind to the subvolume cached for loc's inode. A ref on the
+ * inode is held in local for the duration of the call.
+ */
+int32_t
+dht_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
+             gf_xattrop_flags_t flags, dict_t *dict)
+{
+        dht_local_t *local    = NULL;
+        xlator_t    *subvol   = NULL;
+        int          op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = dht_subvol_get_cached (this, loc->inode);
+        if (subvol == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        local = dht_local_init (frame);
+        if (local == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        local->call_cnt = 1;
+        local->inode    = inode_ref (loc->inode);
+
+        STACK_WIND (frame, dht_xattrop_cbk,
+                    subvol, subvol->fops->xattrop,
+                    loc, flags, dict);
+
+        return 0;
+
+err:
+        if (op_errno == -1)
+                op_errno = errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+/* fxattrop callback: passes the subvolume's result straight through. */
+static int32_t
+dht_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, dict_t *dict)
+{
+        DHT_STACK_UNWIND (frame, op_ret, op_errno, dict);
+
+        return 0;
+}
+
+/* fxattrop fop: wind to the subvolume cached for the fd's inode. */
+int32_t
+dht_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
+              gf_xattrop_flags_t flags, dict_t *dict)
+{
+        int       op_errno = -1;
+        xlator_t *subvol   = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        subvol = dht_subvol_get_cached (this, fd->inode);
+        if (subvol != NULL) {
+                STACK_WIND (frame, dht_fxattrop_cbk,
+                            subvol, subvol->fops->fxattrop,
+                            fd, flags, dict);
+
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "no cached subvolume for fd=%p", fd);
+        op_errno = EINVAL;
+
+err:
+        if (op_errno == -1)
+                op_errno = errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+
+/* inodelk callback: passes the subvolume's result straight through. */
+static int32_t
+dht_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno)
+{
+        DHT_STACK_UNWIND (frame, op_ret, op_errno);
+
+        return 0;
+}
+
+
+/*
+ * inodelk fop: wind to the subvolume cached for loc's inode. A ref on the
+ * inode is held in local for the duration of the call.
+ */
+int32_t
+dht_inodelk (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, int32_t cmd, struct flock *lock)
+{
+        dht_local_t *local    = NULL;
+        xlator_t    *subvol   = NULL;
+        int          op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = dht_subvol_get_cached (this, loc->inode);
+        if (subvol == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        local = dht_local_init (frame);
+        if (local == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        local->call_cnt = 1;
+        local->inode    = inode_ref (loc->inode);
+
+        STACK_WIND (frame, dht_inodelk_cbk,
+                    subvol, subvol->fops->inodelk,
+                    loc, cmd, lock);
+
+        return 0;
+
+err:
+        if (op_errno == -1)
+                op_errno = errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+
+        return 0;
+}
+
+
+/* finodelk callback: passes the subvolume's result straight through. */
+static int32_t
+dht_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno)
+{
+        DHT_STACK_UNWIND (frame, op_ret, op_errno);
+
+        return 0;
+}
+
+
+/* finodelk fop: wind to the subvolume cached for the fd's inode. */
+int32_t
+dht_finodelk (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, int32_t cmd, struct flock *lock)
+{
+        int       op_errno = -1;
+        xlator_t *subvol   = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        subvol = dht_subvol_get_cached (this, fd->inode);
+        if (subvol != NULL) {
+                STACK_WIND (frame, dht_finodelk_cbk,
+                            subvol, subvol->fops->finodelk,
+                            fd, cmd, lock);
+
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "no cached subvolume for fd=%p", fd);
+        op_errno = EINVAL;
+
+err:
+        if (op_errno == -1)
+                op_errno = errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+
+        return 0;
+}
+
+
+/* entrylk callback: passes the subvolume's result straight through. */
+static int32_t
+dht_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno)
+{
+        DHT_STACK_UNWIND (frame, op_ret, op_errno);
+
+        return 0;
+}
+
+/*
+ * entrylk fop: wind to the subvolume cached for loc's inode. A ref on the
+ * inode is held in local for the duration of the call.
+ */
+int32_t
+dht_entrylk (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, const char *basename,
+             entrylk_cmd cmd, entrylk_type type)
+{
+        dht_local_t *local    = NULL;
+        xlator_t    *subvol   = NULL;
+        int          op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = dht_subvol_get_cached (this, loc->inode);
+        if (subvol == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        local = dht_local_init (frame);
+        if (local == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        local->call_cnt = 1;
+        local->inode    = inode_ref (loc->inode);
+
+        STACK_WIND (frame, dht_entrylk_cbk,
+                    subvol, subvol->fops->entrylk,
+                    loc, basename, cmd, type);
+
+        return 0;
+
+err:
+        if (op_errno == -1)
+                op_errno = errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+
+        return 0;
+}
+
+/* fentrylk callback: passes the subvolume's result straight through. */
+static int32_t
+dht_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno)
+{
+        DHT_STACK_UNWIND (frame, op_ret, op_errno);
+
+        return 0;
+}
+
+/* fentrylk fop: wind to the subvolume cached for the fd's inode. */
+int32_t
+dht_fentrylk (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, const char *basename,
+              entrylk_cmd cmd, entrylk_type type)
+{
+        int       op_errno = -1;
+        xlator_t *subvol   = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        subvol = dht_subvol_get_cached (this, fd->inode);
+        if (subvol != NULL) {
+                STACK_WIND (frame, dht_fentrylk_cbk,
+                            subvol, subvol->fops->fentrylk,
+                            fd, basename, cmd, type);
+
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "no cached subvolume for fd=%p", fd);
+        op_errno = EINVAL;
+
+err:
+        if (op_errno == -1)
+                op_errno = errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+
+        return 0;
+}
+
+
+/*
+ * forget handler: release the layout attached to an inode that is being
+ * forgotten.
+ *
+ * The layout pointer is stored in the inode context as a uint64_t. Preset
+ * layouts (layout->preset set) are not freed here; any other layout is
+ * freed. Always returns 0.
+ */
+int
+dht_forget (xlator_t *this, inode_t *inode)
+{
+        uint64_t      tmp_layout = 0;
+        dht_layout_t *layout     = NULL;
+
+        inode_ctx_get (inode, this, &tmp_layout);
+
+        /* Convert the context value BEFORE testing it. The previous code
+           tested `layout` while it was still NULL, so it always returned
+           early and never freed anything. */
+        layout = (dht_layout_t *)(long)tmp_layout;
+        if (!layout)
+                return 0;
+
+        if (!layout->preset)
+                FREE (layout);
+
+        return 0;
+}
+
+
+
+/*
+ * Fill conf->subvolumes with the xlators found in this->children and
+ * allocate a zeroed status byte per subvolume in conf->subvolume_status.
+ * Returns 0 on success, -1 when an allocation fails (nothing is released
+ * here on failure).
+ */
+static int
+dht_init_subvolumes (xlator_t *this, dht_conf_t *conf)
+{
+        xlator_list_t *child = NULL;
+        int            total = 0;
+        int            idx   = 0;
+
+        child = this->children;
+        while (child) {
+                total++;
+                child = child->next;
+        }
+
+        conf->subvolumes = CALLOC (total, sizeof (xlator_t *));
+        if (conf->subvolumes == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                return -1;
+        }
+        conf->subvolume_cnt = total;
+
+        for (child = this->children; child; child = child->next) {
+                conf->subvolumes[idx] = child->xlator;
+                idx++;
+        }
+
+        conf->subvolume_status = CALLOC (idx, sizeof (char));
+        if (conf->subvolume_status == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                return -1;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Event handler. For CHILD_UP / CHILD_DOWN the reporting subvolume is
+ * looked up in conf->subvolumes and its entry in conf->subvolume_status
+ * is set to 1 / 0 under conf->subvolume_lock. Every event, handled or
+ * not, is then passed on to default_notify, whose result is returned.
+ */
+int
+dht_notify (xlator_t *this, int event, void *data, ...)
+{
+ xlator_t *subvol = NULL;
+ int cnt = -1; /* index of subvol in conf->subvolumes, -1 = not found */
+ int i = -1;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+
+
+ conf = this->private;
+
+ switch (event) {
+ case GF_EVENT_CHILD_UP:
+ subvol = data;
+
+ /* NOTE(review): gen is bumped before the subvolume is validated and
+    outside subvolume_lock */
+ conf->gen++;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ cnt = i;
+ break;
+ }
+ }
+
+ if (cnt == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "got GF_EVENT_CHILD_UP bad subvolume %s",
+ subvol->name);
+ break;
+ }
+
+ LOCK (&conf->subvolume_lock);
+ {
+ conf->subvolume_status[cnt] = 1;
+ }
+ UNLOCK (&conf->subvolume_lock);
+
+ break;
+
+ case GF_EVENT_CHILD_DOWN:
+ subvol = data;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ cnt = i;
+ break;
+ }
+ }
+
+ if (cnt == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "got GF_EVENT_CHILD_DOWN bad subvolume %s",
+ subvol->name);
+ break;
+ }
+
+ LOCK (&conf->subvolume_lock);
+ {
+ conf->subvolume_status[cnt] = 0;
+ }
+ UNLOCK (&conf->subvolume_lock);
+
+ break;
+ }
+
+ /* unknown subvolumes skip the status update but are still propagated */
+ ret = default_notify (this, event, data);
+
+ return ret;
+}
+
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
new file mode 100644
index 000000000..17017381b
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -0,0 +1,212 @@
+/*
+ Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef _DHT_H
+#define _DHT_H
+
+
+/* Completion callback type taken by dht_selfheal_directory and
+   dht_selfheal_restore. */
+typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno);
+
+
+/*
+ * In-memory layout: a header followed by a variable number of per-subvolume
+ * entries in list[] (declared as a zero-length trailing array).
+ */
+struct dht_layout {
+ int cnt; /* presumably the number of entries in list[] -- confirm */
+ int preset; /* when non-zero, dht_forget does not free this layout */
+ int gen;
+ int type;
+ struct {
+ int err; /* 0 = normal
+ -1 = dir exists and no xattr
+ >0 = dir lookup failed with errno
+ */
+ uint32_t start; /* presumably the hash range [start, stop] -- confirm */
+ uint32_t stop;
+ xlator_t *xlator;
+ } list[0];
+};
+typedef struct dht_layout dht_layout_t;
+
+
+/*
+ * Per-call state kept in frame->local. It is created with dht_local_init
+ * and released by dht_local_wipe (see DHT_STACK_UNWIND).
+ */
+struct dht_local {
+ int call_cnt; /* outstanding replies; decremented by dht_frame_return */
+ loc_t loc;
+ loc_t loc2; /* second location, e.g. the new name in dht_link */
+ int op_ret; /* accumulated result to unwind */
+ int op_errno;
+ int layout_mismatch;
+ struct stat stbuf;
+ struct statvfs statvfs;
+ fd_t *fd;
+ inode_t *inode;
+ dict_t *xattr;
+ dict_t *xattr_req;
+ dht_layout_t *layout;
+ size_t size;
+ ino_t st_ino;
+ xlator_t *src_hashed, *src_cached;
+ xlator_t *dst_hashed, *dst_cached;
+ xlator_t *cached_subvol;
+ xlator_t *hashed_subvol;
+ char need_selfheal; /* set by dht_rmdir_cbk on errors other than ENOENT */
+ /* state for linkfile creation (dht_linkfile_create) */
+ struct {
+ fop_mknod_cbk_t linkfile_cbk;
+ struct stat stbuf;
+ loc_t loc;
+ inode_t *inode;
+ dict_t *xattr;
+ xlator_t *srcvol;
+ } linkfile;
+ /* state for directory self-heal */
+ struct {
+ uint32_t hole_cnt;
+ uint32_t overlaps_cnt;
+ uint32_t missing;
+ uint32_t down;
+ uint32_t misc;
+ dht_selfheal_dir_cbk_t dir_cbk;
+ dht_layout_t *layout;
+ } selfheal;
+
+ /* needed by nufa */
+ int32_t flags;
+ mode_t mode;
+ dev_t rdev;
+};
+typedef struct dht_local dht_local_t;
+
+
+/* Translator-wide configuration and state, stored in this->private. */
+struct dht_conf {
+ gf_lock_t subvolume_lock; /* taken around subvolume_status updates in dht_notify */
+ int subvolume_cnt; /* number of entries in subvolumes[] */
+ xlator_t **subvolumes; /* child xlators, in this->children order */
+ xlator_t *local_volume; /* Needed by NUFA */
+ char *subvolume_status; /* per subvolume: 1 after CHILD_UP, 0 after CHILD_DOWN */
+ dht_layout_t **file_layouts;
+ dht_layout_t **dir_layouts;
+ dht_layout_t *default_dir_layout;
+ gf_boolean_t search_unhashed;
+ int gen; /* incremented on every CHILD_UP event */
+};
+typedef struct dht_conf dht_conf_t;
+
+
+/*
+ * Fixed-width layout record with a single {start, stop} entry.
+ * NOTE(review): presumably the serialized (on-disk / xattr) form of a
+ * layout entry -- confirm before changing field widths or order.
+ */
+struct dht_disk_layout {
+ uint32_t cnt;
+ uint32_t type;
+ struct {
+ uint32_t start;
+ uint32_t stop;
+ } list[1];
+};
+typedef struct dht_disk_layout dht_disk_layout_t;
+
+/*
+ * Small predicate and unwind helpers. All macro arguments are
+ * parenthesized so that expression arguments (e.g. `a & b`) are not
+ * re-associated by operator precedence inside the expansion.
+ */
+
+/* True when a fop result reports that the entry does not exist. */
+#define ENTRY_MISSING(op_ret, op_errno) (((op_ret) == -1) && ((op_errno) == ENOENT))
+
+#define is_fs_root(loc) (strcmp ((loc)->path, "/") == 0)
+
+/* NOTE: relies on a variable named `this` being in scope at the call site. */
+#define is_revalidate(loc) (inode_ctx_get ((loc)->inode, this, NULL) == 0)
+
+/* True when a call counter (see dht_frame_return) has dropped to zero. */
+#define is_last_call(cnt) ((cnt) == 0)
+
+#define DHT_LINKFILE_MODE (S_ISVTX)
+/* Only `s` (a struct stat pointer) is inspected; `i` and `x` are unused. */
+#define check_is_linkfile(i,s,x) (((s)->st_mode & ~S_IFMT) == DHT_LINKFILE_MODE)
+
+#define check_is_dir(i,s,x) (S_ISDIR((s)->st_mode))
+
+#define layout_is_sane(layout) ((layout) && ((layout)->cnt > 0))
+
+/*
+ * Unwind the frame and release its dht_local_t. The local is detached
+ * from the frame before STACK_UNWIND and wiped only afterwards, so
+ * arguments that still refer to fields of the local stay valid while the
+ * unwind runs.
+ */
+#define DHT_STACK_UNWIND(frame, params ...) do {       \
+                dht_local_t *__local = NULL;            \
+                __local = (frame)->local;               \
+                (frame)->local = NULL;                  \
+                STACK_UNWIND (frame, params);           \
+                dht_local_wipe (__local);               \
+        } while (0)
+
+/* Destroy the whole call stack of the frame and release its dht_local_t. */
+#define DHT_STACK_DESTROY(frame) do {                  \
+                dht_local_t *__local = NULL;            \
+                __local = (frame)->local;               \
+                (frame)->local = NULL;                  \
+                STACK_DESTROY ((frame)->root);          \
+                dht_local_wipe (__local);               \
+        } while (0)
+
+/* ---- layout helpers ---- */
+dht_layout_t *dht_layout_new (xlator_t *this, int cnt);
+dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode);
+dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol);
+xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout,
+ const char *name);
+int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout);
+int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
+ uint32_t *holes_p, uint32_t *overlaps_p,
+ uint32_t *missing_p, uint32_t *down_p,
+ uint32_t *misc_p);
+int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout,
+ xlator_t *subvol, loc_t *loc, dict_t *xattr);
+
+/* ---- linkfile helpers ---- */
+xlator_t *dht_linkfile_subvol (xlator_t *this, inode_t *inode,
+ struct stat *buf, dict_t *xattr);
+int dht_linkfile_unlink (call_frame_t *frame, xlator_t *this,
+ xlator_t *subvol, loc_t *loc);
+
+int dht_layouts_init (xlator_t *this, dht_conf_t *conf);
+int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+ int op_ret, int op_errno, dict_t *xattr);
+
+int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
+ int pos, int32_t **disk_layout_p);
+int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
+ int pos, int32_t *disk_layout);
+
+
+/* Decrements local->call_cnt under the frame lock and returns the new
+   value; returns -1 when frame is NULL. */
+int dht_frame_return (call_frame_t *frame);
+
+/* Inode number mapping: itransform computes y = x * subvolume_cnt + the
+   value of dht_subvol_cnt() for subvol; deitransform splits y again with
+   modulo and division by subvolume_cnt. */
+int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y);
+int dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol,
+ uint64_t *x);
+
+/* ---- per-call state ---- */
+void dht_local_wipe (dht_local_t *local);
+dht_local_t *dht_local_init (call_frame_t *frame);
+int dht_stat_merge (xlator_t *this, struct stat *to, struct stat *from,
+ xlator_t *subvol);
+
+/* ---- subvolume selection ---- */
+xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc);
+xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode);
+xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev);
+int dht_subvol_cnt (xlator_t *this, xlator_t *subvol);
+
+/* Hashes name with the given hash type into *hash_p; returns 0 on success
+   and -1 for an unknown type. */
+int dht_hash_compute (int type, const char *name, uint32_t *hash_p);
+
+int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
+ xlator_t *tovol, xlator_t *fromvol, loc_t *loc);
+/* ---- directory self-heal ---- */
+int
+dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
+ loc_t *loc, dht_layout_t *layout);
+int
+dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
+ loc_t *loc, dht_layout_t *layout);
+
+int dht_rename (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc);
+#endif /* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-hashfn-tea.c b/xlators/cluster/dht/src/dht-hashfn-tea.c
new file mode 100644
index 000000000..8437b4955
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-hashfn-tea.c
@@ -0,0 +1,146 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#define DELTA 0x9E3779B9
+#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
+#define PARTROUNDS 6 /* 6 gets complete mixing */
+
+
+/*
+ * Run `rounds` TEA-style mixing rounds over the pair (*h0, *h1), keyed by
+ * the four words in `array`, and add the mixed values back into *h0 and
+ * *h1. The loop body always runs at least once. Always returns 0.
+ */
+static int
+tearound (int rounds, uint32_t *array, uint32_t *h0, uint32_t *h1)
+{
+        uint32_t sum  = 0;
+        uint32_t v0   = *h0;
+        uint32_t v1   = *h1;
+        int      left = rounds;
+
+        do {
+                sum += DELTA;
+                v0 += ((v1 << 4) + array[0]) ^ (v1 + sum)
+                        ^ ((v1 >> 5) + array[1]);
+                v1 += ((v0 << 4) + array[2]) ^ (v0 + sum)
+                        ^ ((v0 >> 5) + array[3]);
+                left--;
+        } while (left);
+
+        *h0 += v0;
+        *h1 += v1;
+
+        return 0;
+}
+
+
+/*
+ * Build the padding word used for the tail of a hashed message: `len`
+ * OR-ed into itself at shifts of 8, 16 and 24 bits.
+ */
+uint32_t
+__pad (int len)
+{
+        uint32_t low_half = (uint32_t) len | ((uint32_t) len << 8);
+
+        return low_half | (low_half << 16);
+}
+
+
+/*
+ * Hash `len` bytes of `msg` to a 32-bit value.
+ *
+ * Complete 16-byte groups are mixed with PARTROUNDS rounds each. A
+ * trailing partial group is filled up from __pad(len) and the leftover
+ * bytes, then mixed with FULLROUNDS rounds. The result is h0 ^ h1.
+ *
+ * Message words are fetched with memcpy instead of dereferencing `msg`
+ * through a cast uint32_t pointer: `msg` has no alignment guarantee and
+ * the cast violated the aliasing rules. The bytes read, and therefore the
+ * hash values, are unchanged (native byte order, as before).
+ */
+uint32_t
+dht_hashfn_tea (const char *msg, int len)
+{
+        uint32_t    h0 = 0x9464a485;
+        uint32_t    h1 = 0x542e1a94;
+        uint32_t    array[4];
+        uint32_t    pad = 0;
+        uint32_t    word = 0;
+        int         i = 0;
+        int         j = 0;
+        int         full_quads = 0;
+        int         full_words = 0;
+        int         full_bytes = 0;
+        const char *cursor = NULL;
+
+
+        cursor = msg;
+        pad = __pad (len);
+
+        full_bytes = len;
+        full_words = len / 4;
+        full_quads = len / 16;
+
+        for (i = 0; i < full_quads; i++) {
+                for (j = 0; j < 4; j++) {
+                        memcpy (&word, cursor, sizeof (word));
+                        array[j] = word;
+                        cursor += sizeof (word);
+                        full_words--;
+                        full_bytes -= 4;
+                }
+                tearound (PARTROUNDS, &array[0], &h0, &h1);
+        }
+
+        if ((len % 16) == 0) {
+                goto done;
+        }
+
+        for (j = 0; j < 4; j++) {
+                if (full_words) {
+                        memcpy (&word, cursor, sizeof (word));
+                        array[j] = word;
+                        cursor += sizeof (word);
+                        full_words--;
+                        full_bytes -= 4;
+                } else {
+                        array[j] = pad;
+                        /* NOTE(review): msg[] is plain char, so on
+                           signed-char platforms bytes >= 0x80 are
+                           sign-extended before the OR. Left exactly as it
+                           was: changing it would change hash values. */
+                        while (full_bytes) {
+                                array[j] <<= 8;
+                                array[j] |= msg[len - full_bytes];
+                                full_bytes--;
+                        }
+                }
+        }
+        tearound (FULLROUNDS, &array[0], &h0, &h1);
+
+done:
+        return h0 ^ h1;
+}
+
+
+#if 0
+/*
+ * Disabled manual test driver: prints the TEA hash of every command line
+ * argument. Kept compilable against the current function name
+ * (dht_hashfn_tea) and return type.
+ */
+int
+main (int argc, char *argv[])
+{
+        int      i = 0;
+        uint32_t hashval = 0;
+
+        for (i = 1; i < argc; i++) {
+                hashval = dht_hashfn_tea (argv[i], strlen (argv[i]));
+                printf ("%s: %x\n", argv[i], hashval);
+        }
+
+        return 0;
+}
+#endif
diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c
new file mode 100644
index 000000000..9e321a43c
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-hashfn.c
@@ -0,0 +1,88 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "dht-common.h"
+
+
+uint32_t dht_hashfn_tea (const char *name, int len);
+
+
+/* Hash function selectors accepted by dht_hash_compute_internal. */
+typedef enum {
+        DHT_HASH_TYPE_TEA,
+} dht_hashfn_type_t;
+
+
+/*
+ * Hash `name` with the function selected by `type` and store the result
+ * in *hash_p. Returns 0 on success; returns -1 and leaves *hash_p
+ * untouched when `type` is not a known hash type.
+ */
+int
+dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p)
+{
+        if (type != DHT_HASH_TYPE_TEA)
+                return -1;
+
+        *hash_p = dht_hashfn_tea (name, strlen (name));
+
+        return 0;
+}
+
+
+/*
+ * Point rsync_frndly_name at the string that should be hashed for `name`.
+ *
+ * For names of the form ".<base>.<suffix>" (leading dot, a later dot that
+ * is neither the second character nor the last one) a stack copy of
+ * "<base>" is made with alloca and used instead; every other name is used
+ * as is. The copy lives until the calling function returns.
+ *
+ * The macro expands to a single statement WITHOUT a trailing semicolon,
+ * so it can be used safely in if/else bodies; callers supply the ';'.
+ */
+#define MAKE_RSYNC_FRIENDLY_NAME(rsync_frndly_name, name) do {          \
+                rsync_frndly_name = (char *) (name);                    \
+                if ((name)[0] == '.') {                                 \
+                        char *dot     = 0;                              \
+                        int   namelen = 0;                              \
+                                                                        \
+                        dot = strrchr ((name), '.');                    \
+                        if (dot && dot > ((name) + 1) && *(dot + 1)) {  \
+                                namelen = (dot - (name));               \
+                                rsync_frndly_name = alloca (namelen);   \
+                                strncpy (rsync_frndly_name, (name) + 1, \
+                                         namelen);                      \
+                                rsync_frndly_name[namelen - 1] = 0;     \
+                        }                                               \
+                }                                                       \
+        } while (0)
+
+
+/*
+ * Hash a file name: the name is first mapped through
+ * MAKE_RSYNC_FRIENDLY_NAME and the resulting key is hashed with the hash
+ * function selected by `type`. Returns what dht_hash_compute_internal
+ * returns.
+ */
+int
+dht_hash_compute (int type, const char *name, uint32_t *hash_p)
+{
+        char *hash_key = NULL;
+
+        MAKE_RSYNC_FRIENDLY_NAME (hash_key, name);
+
+        return dht_hash_compute_internal (type, hash_key, hash_p);
+}
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
new file mode 100644
index 000000000..52d072002
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -0,0 +1,326 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "dht-common.h"
+
+
+/*
+ * Decrement the pending-call counter stored in FRAME's local while holding
+ * the frame lock, and return the new counter value.  Returns -1 when FRAME
+ * is NULL.
+ */
+int
+dht_frame_return (call_frame_t *frame)
+{
+        dht_local_t *local     = NULL;
+        int          remaining = -1;
+
+        if (frame == NULL)
+                return remaining;
+
+        local = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                local->call_cnt -= 1;
+                remaining = local->call_cnt;
+        }
+        UNLOCK (&frame->lock);
+
+        return remaining;
+}
+
+
+/*
+ * Encode the pair (X, index of SUBVOL) into one 64-bit value:
+ * y = x * subvolume_cnt + index.  The all-ones value is passed through
+ * unchanged.  The result is stored in *Y_P when Y_P is non-NULL.
+ * Always returns 0.
+ */
+int
+dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
+{
+        dht_conf_t *conf  = NULL;
+        int         index = 0;
+        int         total = 0;
+        uint64_t    out   = (uint64_t) -1;
+
+        if (x != ((uint64_t) -1)) {
+                conf  = this->private;
+
+                total = conf->subvolume_cnt;
+                index = dht_subvol_cnt (this, subvol);
+
+                out = ((x * total) + index);
+        }
+
+        if (y_p)
+                *y_p = out;
+
+        return 0;
+}
+
+
+/*
+ * Inverse of dht_itransform(): split Y into the subvolume at index
+ * (y % subvolume_cnt) and the value (y / subvolume_cnt).  Each output
+ * pointer is written only when non-NULL.  Always returns 0.
+ */
+int
+dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p,
+                  uint64_t *x_p)
+{
+        dht_conf_t *conf   = this->private;
+        int         total  = conf->subvolume_cnt;
+        int         index  = y % total;
+        uint64_t    value  = y / total;
+        xlator_t   *target = conf->subvolumes[index];
+
+        if (subvol_p)
+                *subvol_p = target;
+
+        if (x_p)
+                *x_p = value;
+
+        return 0;
+}
+
+
+/*
+ * Release everything referenced by LOCAL (locs, dictionaries, inodes,
+ * layout, fd) and then free LOCAL itself.  A NULL LOCAL is a no-op.
+ */
+void
+dht_local_wipe (dht_local_t *local)
+{
+        if (local == NULL)
+                return;
+
+        /* state of the primary operation */
+        loc_wipe (&local->loc);
+        loc_wipe (&local->loc2);
+
+        if (local->xattr != NULL) {
+                dict_unref (local->xattr);
+        }
+
+        if (local->inode != NULL) {
+                inode_unref (local->inode);
+        }
+
+        if (local->layout != NULL) {
+                FREE (local->layout);
+        }
+
+        /* state of an in-flight linkfile operation */
+        loc_wipe (&local->linkfile.loc);
+
+        if (local->linkfile.xattr != NULL) {
+                dict_unref (local->linkfile.xattr);
+        }
+
+        if (local->linkfile.inode != NULL) {
+                inode_unref (local->linkfile.inode);
+        }
+
+        if (local->fd != NULL) {
+                fd_unref (local->fd);
+                local->fd = NULL;
+        }
+
+        if (local->xattr_req != NULL) {
+                dict_unref (local->xattr_req);
+        }
+
+        FREE (local);
+}
+
+
+/*
+ * Allocate a zero-filled dht_local_t, preset its result to failure
+ * (op_ret = -1, op_errno = EUCLEAN) and attach it to FRAME.
+ * Returns the new local, or NULL when allocation fails (FRAME is then
+ * left untouched).
+ */
+dht_local_t *
+dht_local_init (call_frame_t *frame)
+{
+        /* TODO: use mem-pool */
+        dht_local_t *fresh = CALLOC (1, sizeof (*fresh));
+
+        if (fresh != NULL) {
+                fresh->op_ret   = -1;
+                fresh->op_errno = EUCLEAN;
+
+                frame->local = fresh;
+        }
+
+        return fresh;
+}
+
+
+/*
+ * Return a pointer to the component of STR that follows its last '/',
+ * or NULL when STR contains no '/'.  The result points into STR.
+ */
+char *
+basestr (const char *str)
+{
+        char *last_slash = strrchr (str, '/');
+
+        if (last_slash == NULL)
+                return NULL;
+
+        return last_slash + 1;
+}
+
+/*
+ * Return the first subvolume whose subvolume_status entry is non-zero,
+ * scanning under conf->subvolume_lock, or NULL when none is set.
+ */
+xlator_t *
+dht_first_up_child (xlator_t *this)
+{
+        dht_conf_t *conf  = this->private;
+        xlator_t   *found = NULL;
+        int         idx   = 0;
+
+        LOCK (&conf->subvolume_lock);
+        {
+                for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+                        if (!conf->subvolume_status[idx])
+                                continue;
+
+                        found = conf->subvolumes[idx];
+                        break;
+                }
+        }
+        UNLOCK (&conf->subvolume_lock);
+
+        return found;
+}
+
+/*
+ * Return the subvolume that LOC's name hashes to according to the layout
+ * stored on LOC's parent inode.  For the filesystem root the first
+ * subvolume reported up is returned instead.  Returns NULL (after logging)
+ * when the parent has no layout or no range covers the hash.
+ */
+xlator_t *
+dht_subvol_get_hashed (xlator_t *this, loc_t *loc)
+{
+        dht_layout_t *parent_layout = NULL;
+        xlator_t     *hashed        = NULL;
+
+        if (is_fs_root (loc))
+                return dht_first_up_child (this);
+
+        parent_layout = dht_layout_get (this, loc->parent);
+        if (parent_layout == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "layout missing path=%s parent=%"PRId64,
+                        loc->path, loc->parent->ino);
+                return NULL;
+        }
+
+        hashed = dht_layout_search (this, parent_layout, loc->name);
+        if (hashed == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not find subvolume for path=%s",
+                        loc->path);
+        }
+
+        return hashed;
+}
+
+
+/*
+ * Return the subvolume recorded in the first entry of the layout stored
+ * on INODE, or NULL when INODE has no layout.
+ */
+xlator_t *
+dht_subvol_get_cached (xlator_t *this, inode_t *inode)
+{
+        dht_layout_t *inode_layout = dht_layout_get (this, inode);
+
+        if (inode_layout == NULL)
+                return NULL;
+
+        return inode_layout->list[0].xlator;
+}
+
+
+/*
+ * Return the subvolume that follows PREV in conf->subvolumes, or NULL
+ * when PREV is the last entry or is not in the array at all.
+ */
+xlator_t *
+dht_subvol_next (xlator_t *this, xlator_t *prev)
+{
+        dht_conf_t *conf = this->private;
+        xlator_t   *succ = NULL;
+        int         pos  = 0;
+
+        /* locate PREV; pos ends at subvolume_cnt when it is absent */
+        for (pos = 0; pos < conf->subvolume_cnt; pos++) {
+                if (conf->subvolumes[pos] == prev)
+                        break;
+        }
+
+        if ((pos + 1) < conf->subvolume_cnt)
+                succ = conf->subvolumes[pos + 1];
+
+        return succ;
+}
+
+
+/*
+ * Return the index of SUBVOL within conf->subvolumes, or -1 when SUBVOL
+ * is not one of this translator's subvolumes.
+ */
+int
+dht_subvol_cnt (xlator_t *this, xlator_t *subvol)
+{
+        dht_conf_t *conf = this->private;
+        int         idx  = 0;
+
+        for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+                if (conf->subvolumes[idx] == subvol)
+                        return idx;
+        }
+
+        return -1;
+}
+
+
+/* Raise lvalue A to B when A is smaller than B (A becomes max (A, B)).
+ * Both arguments are evaluated more than once. */
+#define set_if_greater(a, b) do { \
+ if ((a) < (b)) \
+ (a) = (b); \
+ } while (0)
+
+/*
+ * Fold the stat FROM, obtained from SUBVOL, into the aggregate stat TO.
+ *
+ * Identity fields (dev, mode, nlink, uid, gid, rdev, blksize) are copied,
+ * the inode number is encoded with dht_itransform(), st_size and
+ * st_blocks are accumulated, and each timestamp keeps the later of the
+ * two values.  Always returns 0.
+ */
+int
+dht_stat_merge (xlator_t *this, struct stat *to,
+                struct stat *from, xlator_t *subvol)
+{
+        uint64_t ino = 0;
+
+        to->st_dev = from->st_dev;
+
+        /* dht_itransform() stores through a uint64_t *; go through a
+         * local so this stays correct when ino_t is not 64 bits wide. */
+        dht_itransform (this, subvol, from->st_ino, &ino);
+        to->st_ino = ino;
+
+        to->st_mode    = from->st_mode;
+        to->st_nlink   = from->st_nlink;
+        to->st_uid     = from->st_uid;
+        to->st_gid     = from->st_gid;
+        to->st_rdev    = from->st_rdev;
+        to->st_size   += from->st_size;
+        to->st_blksize = from->st_blksize;
+        to->st_blocks += from->st_blocks;
+
+        set_if_greater (to->st_atime, from->st_atime);
+        set_if_greater (to->st_mtime, from->st_mtime);
+        set_if_greater (to->st_ctime, from->st_ctime);
+
+        return 0;
+}
diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c
new file mode 100644
index 000000000..08b4a2746
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-layout.c
@@ -0,0 +1,543 @@
+/*
+ Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "dht-common.h"
+#include "byte-order.h"
+
+/* sizeof (dht_layout_t) as declared */
+#define layout_base_size (sizeof (dht_layout_t))
+
+/* size of one element of dht_layout_t's list[] array */
+#define layout_entry_size (sizeof ((dht_layout_t *)NULL)->list[0])
+
+/* bytes allocated for a layout with CNT entries; CNT is parenthesized so
+ * that expressions such as (a + b) multiply as a whole */
+#define layout_size(cnt) (layout_base_size + ((cnt) * layout_entry_size))
+
+
+/*
+ * Allocate a zero-filled layout with room for CNT entries and record CNT
+ * in it.  Returns NULL (after logging) when allocation fails.
+ */
+dht_layout_t *
+dht_layout_new (xlator_t *this, int cnt)
+{
+        dht_layout_t *fresh = CALLOC (1, layout_size (cnt));
+
+        if (fresh != NULL) {
+                fresh->cnt = cnt;
+        } else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+        }
+
+        return fresh;
+}
+
+
+/*
+ * Return the layout stored in INODE's context for this translator, or
+ * NULL when inode_ctx_get() reports failure.
+ */
+dht_layout_t *
+dht_layout_get (xlator_t *this, inode_t *inode)
+{
+        uint64_t ctx_value = 0;
+
+        /* Do not trust ctx_value unless the lookup succeeded. */
+        if (inode_ctx_get (inode, this, &ctx_value) != 0)
+                return NULL;
+
+        return (dht_layout_t *)(long)ctx_value;
+}
+
+
+/*
+ * Hash NAME with LAYOUT's hash type and return the subvolume of the first
+ * layout entry whose [start, stop] range contains the hash.  Returns NULL
+ * when hashing fails or no entry covers the hash (both cases are logged).
+ */
+xlator_t *
+dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name)
+{
+        uint32_t  hash  = 0;
+        xlator_t *match = NULL;
+        int       idx   = 0;
+
+        if (dht_hash_compute (layout->type, name, &hash) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "hash computation failed for type=%d name=%s",
+                        layout->type, name);
+                return NULL;
+        }
+
+        for (idx = 0; idx < layout->cnt; idx++) {
+                /* skip entries whose range does not contain the hash */
+                if (layout->list[idx].start > hash
+                    || layout->list[idx].stop < hash)
+                        continue;
+
+                match = layout->list[idx].xlator;
+                break;
+        }
+
+        if (match == NULL) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no subvolume for hash (value) = %u", hash);
+        }
+
+        return match;
+}
+
+
+/*
+ * Return the entry of conf->file_layouts that sits at the same index as
+ * SUBVOL in conf->subvolumes, or NULL when SUBVOL is not a known
+ * subvolume.
+ */
+dht_layout_t *
+dht_layout_for_subvol (xlator_t *this, xlator_t *subvol)
+{
+        dht_conf_t *conf = this->private;
+        int         idx  = 0;
+
+        for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+                if (conf->subvolumes[idx] == subvol)
+                        return conf->file_layouts[idx];
+        }
+
+        return NULL;
+}
+
+
+/*
+ * Build conf->file_layouts: one single-entry layout per subvolume, marked
+ * preset and pointing at that subvolume.  Returns 0 on success and -1
+ * when an allocation fails (layouts created so far stay in
+ * conf->file_layouts).
+ */
+int
+dht_layouts_init (xlator_t *this, dht_conf_t *conf)
+{
+        dht_layout_t *single = NULL;
+        int           idx    = 0;
+
+        conf->file_layouts = CALLOC (conf->subvolume_cnt,
+                                     sizeof (dht_layout_t *));
+        if (conf->file_layouts == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                return -1;
+        }
+
+        for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+                single = dht_layout_new (this, 1);
+                if (single == NULL)
+                        return -1;
+
+                single->preset         = 1;
+                single->list[0].xlator = conf->subvolumes[idx];
+
+                conf->file_layouts[idx] = single;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Serialise entry POS of LAYOUT into a freshly allocated array of
+ * network-byte-order 32-bit words: { 1, layout type, start, stop }.
+ *
+ * On success returns 0 and, when DISK_LAYOUT_P is non-NULL, stores the
+ * buffer there (the caller then owns it).  When DISK_LAYOUT_P is NULL the
+ * buffer is released again instead of being leaked.  Returns -1 when
+ * allocation fails.
+ */
+int
+dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
+                         int pos, int32_t **disk_layout_p)
+{
+        int32_t *disk_layout = NULL;
+
+        /* five words are allocated although only four are filled; the
+         * count is kept as in the original allocation */
+        disk_layout = CALLOC (5, sizeof (*disk_layout));
+        if (!disk_layout) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                return -1;
+        }
+
+        disk_layout[0] = hton32 (1);
+        disk_layout[1] = hton32 (layout->type);
+        disk_layout[2] = hton32 (layout->list[pos].start);
+        disk_layout[3] = hton32 (layout->list[pos].stop);
+
+        if (disk_layout_p) {
+                *disk_layout_p = disk_layout;
+        } else {
+                /* nobody to hand the buffer to */
+                FREE (disk_layout);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Decode the on-disk layout words in DISK_LAYOUT (network byte order:
+ * count, type, start, stop) and store start/stop into entry POS of LAYOUT.
+ * Returns 0 on success, -1 when the count word is not 1.
+ */
+int
+dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
+                       int pos, int32_t *disk_layout)
+{
+        int      cnt       = 0;
+        int      type      = 0;
+        /* unsigned: these are logged with %u and compared against
+         * unsigned hashes elsewhere */
+        uint32_t start_off = 0;
+        uint32_t stop_off  = 0;
+
+
+        /* TODO: assert disk_layout_ptr is of required length */
+
+        cnt = ntoh32 (disk_layout[0]);
+        if (cnt != 1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "disk layout has invalid count %d", cnt);
+                return -1;
+        }
+
+        /* TODO: assert type is compatible */
+        type      = ntoh32 (disk_layout[1]);
+        start_off = ntoh32 (disk_layout[2]);
+        stop_off  = ntoh32 (disk_layout[3]);
+
+        layout->list[pos].start = start_off;
+        layout->list[pos].stop  = stop_off;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "merged to layout: %u - %u (type %d) from %s",
+                start_off, stop_off, type,
+                layout->list[pos].xlator->name);
+
+        return 0;
+}
+
+
+/*
+ * Record SUBVOL's reply in the first unused entry of LAYOUT.
+ *
+ * The entry's err is set to OP_ERRNO when OP_RET is non-zero, to -1 when
+ * the reply carries no "trusted.glusterfs.dht" xattr, and to 0 once the
+ * on-disk range has been merged in.
+ *
+ * Returns 0 in all of those cases, and -1 when the on-disk layout is
+ * invalid or LAYOUT has no unused entry left for a successful reply.
+ */
+int
+dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+                  int op_ret, int op_errno, dict_t *xattr)
+{
+        int      i           = 0;
+        int      ret         = -1;
+        int      err         = -1;
+        int32_t *disk_layout = NULL;
+
+
+        if (op_ret != 0) {
+                err = op_errno;
+        }
+
+        /* claim the first entry that has no subvolume yet */
+        for (i = 0; i < layout->cnt; i++) {
+                if (layout->list[i].xlator == NULL) {
+                        layout->list[i].err    = err;
+                        layout->list[i].xlator = subvol;
+                        break;
+                }
+        }
+
+        if (op_ret != 0) {
+                ret = 0;
+                goto out;
+        }
+
+        if (i == layout->cnt) {
+                /* no entry was claimed: list[i] would be out of bounds */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no free layout slot for subvolume %s",
+                        subvol->name);
+                ret = -1;
+                goto out;
+        }
+
+        if (xattr) {
+                /* during lookup and not mkdir */
+                ret = dict_get_ptr (xattr, "trusted.glusterfs.dht",
+                                    VOID(&disk_layout));
+        }
+
+        if (ret != 0) {
+                layout->list[i].err = -1;
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "missing disk layout on %s. err = %d",
+                        subvol->name, err);
+                ret = 0;
+                goto out;
+        }
+
+        ret = dht_disk_layout_merge (this, layout, i, disk_layout);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "layout merge from subvolume %s failed",
+                        subvol->name);
+                goto out;
+        }
+        layout->list[i].err = 0;
+
+out:
+        return ret;
+}
+
+
+/*
+ * Exchange the start, stop, xlator and err fields of entries I and J of
+ * LAYOUT.
+ */
+void
+dht_layout_entry_swap (dht_layout_t *layout, int i, int j)
+{
+        uint32_t  saved_start  = layout->list[i].start;
+        uint32_t  saved_stop   = layout->list[i].stop;
+        xlator_t *saved_xlator = layout->list[i].xlator;
+        int       saved_err    = layout->list[i].err;
+
+        /* entry j -> entry i */
+        layout->list[i].start  = layout->list[j].start;
+        layout->list[i].stop   = layout->list[j].stop;
+        layout->list[i].xlator = layout->list[j].xlator;
+        layout->list[i].err    = layout->list[j].err;
+
+        /* saved entry i -> entry j */
+        layout->list[j].start  = saved_start;
+        layout->list[j].stop   = saved_stop;
+        layout->list[j].xlator = saved_xlator;
+        layout->list[j].err    = saved_err;
+}
+
+
+/*
+ * Ordering used by dht_layout_sort(): when either entry has a non-zero
+ * err the entries compare by err, otherwise by start offset.  The result
+ * is negative, zero or positive like a conventional comparator.
+ */
+int64_t
+dht_layout_entry_cmp (dht_layout_t *layout, int i, int j)
+{
+        if (layout->list[i].err || layout->list[j].err)
+                return layout->list[i].err - layout->list[j].err;
+
+        return (int64_t) layout->list[i].start
+                - (int64_t) layout->list[j].start;
+}
+
+
+/*
+ * Sort LAYOUT's entries in place into the order defined by
+ * dht_layout_entry_cmp().  Always returns 0.
+ */
+int
+dht_layout_sort (dht_layout_t *layout)
+{
+        int     outer = 0;
+        int     inner = 0;
+        int64_t order = 0;
+
+        /* TODO: O(n^2) -- bad bad */
+
+        for (outer = 0; outer < layout->cnt - 1; outer++) {
+                for (inner = outer + 1; inner < layout->cnt; inner++) {
+                        order = dht_layout_entry_cmp (layout, outer, inner);
+                        if (order <= 0)
+                                continue;
+
+                        dht_layout_entry_swap (layout, outer, inner);
+                }
+        }
+
+        return 0;
+}
+
+
+/*
+ * Walk LAYOUT's entries and count the anomalies found.
+ *
+ * Outputs (each written only when its pointer is non-NULL):
+ *   holes_p    - number of gaps found (hole_cnt, not the summed gap size)
+ *   overlaps_p - number of overlapping ranges found (overlap_cnt)
+ *   missing_p  - entries whose err is -1 or ENOENT
+ *   down_p     - entries whose err is ENOTCONN
+ *   misc_p     - entries with any other non-zero err
+ *
+ * Always returns 0.  THIS and LOC are not used for the computation.
+ * NOTE(review): the walk looks like it expects the entries to be sorted
+ * already (dht_layout_normalize() sorts before calling) -- confirm.
+ */
+int
+dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
+ uint32_t *holes_p, uint32_t *overlaps_p,
+ uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p)
+{
+ dht_conf_t *conf = NULL;
+ uint32_t holes = 0;
+ uint32_t overlaps = 0;
+ uint32_t missing = 0;
+ uint32_t down = 0;
+ uint32_t misc = 0;
+ uint32_t hole_cnt = 0;
+ uint32_t overlap_cnt = 0;
+ int i = 0;
+ int ret = 0;
+ uint32_t prev_stop = 0;
+ uint32_t last_stop = 0;
+ char is_virgin = 1;
+
+
+ conf = this->private;
+
+ /* Treat the hash space as a ring that begins at the first entry's
+ * start: last_stop is the value just before it (unsigned arithmetic,
+ * so it wraps when that start is 0).  prev_stop begins there as well,
+ * so the first usable entry is expected to start at
+ * last_stop + 1. */
+
+ last_stop = layout->list[0].start - 1;
+ prev_stop = last_stop;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err) {
+ /* errored entries are only classified; they take no part
+ * in the range checks below */
+ switch (layout->list[i].err) {
+ case -1:
+ case ENOENT:
+ missing++;
+ break;
+ case ENOTCONN:
+ down++;
+ break;
+ default:
+ misc++;
+ }
+ continue;
+ }
+
+ /* at least one entry carries a usable range */
+ is_virgin = 0;
+
+ /* range starts after the expected offset: a gap */
+ if ((prev_stop + 1) < layout->list[i].start) {
+ hole_cnt++;
+ holes += (layout->list[i].start - (prev_stop + 1));
+ }
+
+ /* range starts before the expected offset: an overlap */
+ if ((prev_stop + 1) > layout->list[i].start) {
+ overlap_cnt++;
+ overlaps += ((prev_stop + 1) - layout->list[i].start);
+ }
+ prev_stop = layout->list[i].stop;
+ }
+
+ /* The ring is not closed if the last usable range does not end at
+ * last_stop, or if there was no usable range at all.  Only the
+ * hole_cnt++ is governed by the if; the holes += statement below
+ * runs unconditionally. */
+ if ((last_stop - prev_stop) || is_virgin)
+ hole_cnt++;
+ holes += (last_stop - prev_stop);
+
+ if (holes_p)
+ *holes_p = hole_cnt;
+
+ if (overlaps_p)
+ *overlaps_p = overlap_cnt;
+
+ if (missing_p)
+ *missing_p = missing;
+
+ if (down_p)
+ *down_p = down;
+
+ if (misc_p)
+ *misc_p = misc;
+
+ return ret;
+}
+
+
+/*
+ * Sort LAYOUT and inspect it for anomalies.  Returns 1 when holes or
+ * overlaps were found (logged as a first-time lookup when every entry is
+ * missing, as an error otherwise), -1 when sorting or the anomaly scan
+ * reports -1, and otherwise the anomaly scan's return value.
+ */
+int
+dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout)
+{
+        int      ret          = 0;
+        uint32_t hole_cnt     = 0;
+        uint32_t overlap_cnt  = 0;
+        uint32_t missing_cnt  = 0;
+        uint32_t down_cnt     = 0;
+        uint32_t misc_cnt     = 0;
+
+        ret = dht_layout_sort (layout);
+        if (ret == -1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "sort failed?! how the ....");
+                return ret;
+        }
+
+        ret = dht_layout_anomalies (this, loc, layout,
+                                    &hole_cnt, &overlap_cnt,
+                                    &missing_cnt, &down_cnt, &misc_cnt);
+        if (ret == -1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "error while finding anomalies in %s -- not good news",
+                        loc->path);
+                return ret;
+        }
+
+        if (!hole_cnt && !overlap_cnt)
+                return ret;
+
+        if (missing_cnt == layout->cnt) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "directory %s looked up first time",
+                        loc->path);
+        } else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "found anomalies in %s. holes=%d overlaps=%d",
+                        loc->path, hole_cnt, overlap_cnt);
+        }
+
+        return 1;
+}
+
+
+/*
+ * Compare the range held for SUBVOL in the in-memory LAYOUT with the
+ * range stored in the "trusted.glusterfs.dht" key of XATTR.
+ *
+ * Returns 0 when both ranges agree, 1 when they differ or LAYOUT has no
+ * entry for SUBVOL, and -1 when XATTR is NULL, the key is missing, or the
+ * on-disk count word is not 1.
+ */
+int
+dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+                         loc_t *loc, dict_t *xattr)
+{
+        int       idx         = 0;
+        int       pos         = -1;
+        int       ret         = -1;
+        int32_t  *disk_layout = NULL;
+        int32_t   count       = -1;
+        uint32_t  start_off   = -1;
+        uint32_t  stop_off    = -1;
+
+
+        for (idx = 0; idx < layout->cnt; idx++) {
+                if (layout->list[idx].xlator == subvol) {
+                        pos = idx;
+                        break;
+                }
+        }
+
+        if (pos == -1) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "%s - no layout info for subvolume %s",
+                        loc->path, subvol->name);
+                ret = 1;
+                goto out;
+        }
+
+        if (xattr == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "%s - xattr dictionary is NULL",
+                        loc->path);
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_get_ptr (xattr, "trusted.glusterfs.dht",
+                            VOID(&disk_layout));
+
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "%s - disk layout missing", loc->path);
+                ret = -1;
+                goto out;
+        }
+
+        count = ntoh32 (disk_layout[0]);
+        if (count != 1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "%s - disk layout has invalid count %d",
+                        loc->path, count);
+                ret = -1;
+                goto out;
+        }
+
+        start_off = ntoh32 (disk_layout[2]);
+        stop_off  = ntoh32 (disk_layout[3]);
+
+        if ((layout->list[pos].start != start_off)
+            || (layout->list[pos].stop != stop_off)) {
+                /* the ranges are unsigned; print them as such so values
+                 * above 0x7fffffff do not show up negative */
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "subvol: %s; inode layout - %"PRIu32" - %"PRIu32"; "
+                        "disk layout - %"PRIu32" - %"PRIu32,
+                        layout->list[pos].xlator->name,
+                        layout->list[pos].start, layout->list[pos].stop,
+                        start_off, stop_off);
+                ret = 1;
+        } else {
+                ret = 0;
+        }
+out:
+        return ret;
+}
+
diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c
new file mode 100644
index 000000000..9cc24ccf6
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-linkfile.c
@@ -0,0 +1,224 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "compat.h"
+#include "dht-common.h"
+
+
+
+/*
+ * setxattr callback of linkfile creation: forward the result, together
+ * with the inode and stat saved in local->linkfile, to the callback
+ * registered through dht_linkfile_create().
+ */
+int
+dht_linkfile_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno)
+{
+        dht_local_t     *local = frame->local;
+        fop_mknod_cbk_t  done  = local->linkfile.linkfile_cbk;
+
+        done (frame, cookie, this, op_ret, op_errno,
+              local->linkfile.inode, &local->linkfile.stbuf);
+
+        return 0;
+}
+
+
+/*
+ * mknod callback of linkfile creation.
+ *
+ * On success, builds a dictionary holding
+ * "trusted.glusterfs.dht.linkto" = name of local->linkfile.srcvol and
+ * winds setxattr to the subvolume that answered; the final result is
+ * delivered by dht_linkfile_xattr_cbk().
+ *
+ * On any failure (mknod failed, allocation failed, dictionary could not
+ * be filled) the registered linkfile callback is invoked directly with
+ * op_ret == -1 and a matching op_errno.
+ */
+int
+dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                         int op_ret, int op_errno,
+                         inode_t *inode, struct stat *stbuf)
+{
+        dht_local_t  *local    = NULL;
+        call_frame_t *prev     = NULL;
+        dict_t       *xattr    = NULL;
+        data_t       *str_data = NULL;
+        int           ret      = -1;
+
+        local = frame->local;
+        prev  = cookie;
+
+        if (op_ret == -1)
+                goto err;
+
+        xattr = get_new_dict ();
+        if (!xattr) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                /* mknod succeeded, so op_ret must be flipped here */
+                op_ret   = -1;
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        local->linkfile.xattr = dict_ref (xattr);
+        local->linkfile.inode = inode_ref (inode);
+
+        str_data = str_to_data (local->linkfile.srcvol->name);
+        if (!str_data) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                op_ret   = -1;
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        ret = dict_set (xattr, "trusted.glusterfs.dht.linkto", str_data);
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to initialize linkfile data");
+                /* do not wind setxattr with an empty dictionary; str_data
+                 * is released at err.
+                 * NOTE(review): assumes a failed dict_set() leaves
+                 * str_data unowned -- confirm. */
+                op_ret   = -1;
+                op_errno = EINVAL;
+                goto err;
+        }
+        /* presumably owned by the dictionary from here on */
+        str_data = NULL;
+
+        local->linkfile.stbuf = *stbuf;
+
+        STACK_WIND (frame, dht_linkfile_xattr_cbk,
+                    prev->this, prev->this->fops->setxattr,
+                    &local->linkfile.loc, local->linkfile.xattr, 0);
+
+        return 0;
+
+err:
+        if (str_data) {
+                data_destroy (str_data);
+                str_data = NULL;
+        }
+
+        local->linkfile.linkfile_cbk (frame, cookie, this,
+                                      op_ret, op_errno, inode, stbuf);
+        return 0;
+}
+
+
+/*
+ * Start creating a linkfile for LOC on FROMVOL that refers to TOVOL:
+ * remember LINKFILE_CBK, TOVOL and a copy of LOC in the frame's local,
+ * then wind mknod (regular file, DHT_LINKFILE_MODE) to FROMVOL.
+ * Always returns 0; the outcome is reported through LINKFILE_CBK.
+ */
+int
+dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
+                     xlator_t *tovol, xlator_t *fromvol, loc_t *loc)
+{
+        dht_local_t *local = frame->local;
+
+        /* bookkeeping consumed by the mknod/setxattr callbacks */
+        local->linkfile.linkfile_cbk = linkfile_cbk;
+        local->linkfile.srcvol       = tovol;
+        loc_copy (&local->linkfile.loc, loc);
+
+        STACK_WIND (frame, dht_linkfile_create_cbk,
+                    fromvol, fromvol->fops->mknod, loc,
+                    S_IFREG | DHT_LINKFILE_MODE, 0);
+
+        return 0;
+}
+
+
+/*
+ * unlink callback of dht_linkfile_unlink(): log a warning when the unlink
+ * failed, then destroy the frame.
+ */
+int
+dht_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno)
+{
+        dht_local_t  *local   = frame->local;
+        call_frame_t *prev    = cookie;
+        xlator_t     *replied = prev->this;
+
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "unlinking linkfile %s on %s failed (%s)",
+                        local->loc.path, replied->name, strerror (op_errno));
+        }
+
+        DHT_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/*
+ * Unlink LOC on SUBVOL using a copy of FRAME, so the caller's frame is
+ * not involved in the reply.  Returns 0 once the unlink has been wound,
+ * -1 when the frame copy or its local could not be allocated.
+ */
+int
+dht_linkfile_unlink (call_frame_t *frame, xlator_t *this,
+                     xlator_t *subvol, loc_t *loc)
+{
+        call_frame_t *bg_frame = NULL;
+        dht_local_t  *bg_local = NULL;
+
+        bg_frame = copy_frame (frame);
+        if (bg_frame == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                return -1;
+        }
+
+        bg_local = dht_local_init (bg_frame);
+        if (bg_local == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                DHT_STACK_DESTROY (bg_frame);
+                return -1;
+        }
+
+        loc_copy (&bg_local->loc, loc);
+
+        STACK_WIND (bg_frame, dht_linkfile_unlink_cbk,
+                    subvol, subvol->fops->unlink,
+                    &bg_local->loc);
+
+        return 0;
+}
+
+
+/*
+ * Return the subvolume whose name equals the value stored under
+ * "trusted.glusterfs.dht.linkto" in XATTR, or NULL when XATTR is NULL,
+ * the key is absent, or no subvolume has that name.  INODE and STBUF are
+ * not consulted.
+ */
+xlator_t *
+dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct stat *stbuf,
+                     dict_t *xattr)
+{
+        dht_conf_t *conf     = this->private;
+        void       *linkto   = NULL;
+        int         idx      = 0;
+        int         ret      = 0;
+
+        if (xattr == NULL)
+                return NULL;
+
+        ret = dict_get_ptr (xattr, "trusted.glusterfs.dht.linkto", &linkto);
+
+        if ((-1 == ret) || !linkto)
+                return NULL;
+
+        for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+                if (strcmp (conf->subvolumes[idx]->name,
+                            (char *)linkto) == 0)
+                        return conf->subvolumes[idx];
+        }
+
+        return NULL;
+}
+
+
diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c
new file mode 100644
index 000000000..e5532f1bc
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-rename.c
@@ -0,0 +1,562 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+/* TODO: link(oldpath, newpath) fails if newpath already exists. DHT should
+ * delete the newpath if it gets EEXIST from the link() call.
+ */
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "dht-common.h"
+#include "defaults.h"
+
+
+/*
+ * Per-subvolume rename callback for directories.  A failure is logged
+ * and recorded in local; a success stores the returned stat.  When the
+ * last reply arrives the frame is unwound with the recorded result.
+ */
+int
+dht_rename_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct stat *stbuf)
+{
+        dht_local_t  *local   = frame->local;
+        call_frame_t *prev    = cookie;
+        int           pending = 0;
+
+        if (op_ret != -1) {
+                /* TODO: construct proper stbuf for dir */
+                local->stbuf = *stbuf;
+        } else {
+                /* TODO: undo the damage */
+
+                gf_log (this->name, GF_LOG_ERROR,
+                        "rename %s -> %s on %s failed (%s)",
+                        local->loc.path, local->loc2.path,
+                        prev->this->name, strerror (op_errno));
+
+                local->op_ret   = op_ret;
+                local->op_errno = op_errno;
+        }
+
+        pending = dht_frame_return (frame);
+        if (is_last_call (pending)) {
+                DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                                  &local->stbuf);
+        }
+
+        return 0;
+}
+
+
+
+/*
+ * Second stage of a directory rename: unless an earlier stage recorded a
+ * failure in local->op_ret, wind rename (loc -> loc2) to every subvolume
+ * and let dht_rename_dir_cbk() collect the replies.  On a recorded
+ * failure the frame is unwound immediately.  Always returns 0.
+ */
+int
+dht_rename_dir_do (call_frame_t *frame, xlator_t *this)
+{
+        dht_local_t *local = NULL;
+        dht_conf_t  *conf  = NULL;
+        int          i     = 0;
+
+        conf  = this->private;
+        local = frame->local;
+
+        if (local->op_ret == -1)
+                goto err;
+
+        local->call_cnt = conf->subvolume_cnt;
+        local->op_ret   = 0;
+
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+                STACK_WIND (frame, dht_rename_dir_cbk,
+                            conf->subvolumes[i],
+                            conf->subvolumes[i]->fops->rename,
+                            &local->loc, &local->loc2);
+        }
+
+        return 0;
+
+err:
+        /* the rename callback takes a stat pointer after op_errno; pass
+         * an explicit NULL instead of leaving the argument out */
+        DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, NULL);
+        return 0;
+}
+
+
+/*
+ * readdir callback used while renaming onto an existing directory: more
+ * than two returned entries marks the operation as failed with ENOTEMPTY.
+ * When the last reply arrives, dht_rename_dir_do() takes over.
+ */
+int
+dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno, gf_dirent_t *entries)
+{
+        dht_local_t  *local   = frame->local;
+        call_frame_t *prev    = cookie;
+        int           pending = -1;
+
+        if (op_ret > 2) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "readdir on %s for %s returned %d entries",
+                        prev->this->name, local->loc.path, op_ret);
+
+                local->op_ret   = -1;
+                local->op_errno = ENOTEMPTY;
+        }
+
+        pending = dht_frame_return (frame);
+        if (is_last_call (pending)) {
+                dht_rename_dir_do (frame, this);
+        }
+
+        return 0;
+}
+
+
+/*
+ * opendir callback used while renaming onto an existing directory.  On
+ * success a readdir is wound to the same subvolume.  On failure the error
+ * is logged and this reply is accounted for; the last reply triggers
+ * dht_rename_dir_do().
+ */
+int
+dht_rename_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno, fd_t *fd)
+{
+        dht_local_t  *local   = frame->local;
+        call_frame_t *prev    = cookie;
+        int           pending = -1;
+
+        if (op_ret != -1) {
+                STACK_WIND (frame, dht_rename_readdir_cbk,
+                            prev->this, prev->this->fops->readdir,
+                            local->fd, 4096, 0);
+
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "opendir on %s for %s failed (%s)",
+                prev->this->name, local->loc.path,
+                strerror (op_errno));
+
+        pending = dht_frame_return (frame);
+        if (is_last_call (pending)) {
+                dht_rename_dir_do (frame, this);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Entry point for renaming a directory.  When the destination has a
+ * cached subvolume, opendir is wound to every subvolume on the
+ * destination (the replies lead to the ENOTEMPTY check); otherwise the
+ * rename itself is started right away.  Unwinds with ENOMEM when the fd
+ * cannot be created.  Always returns 0.
+ */
+int
+dht_rename_dir (call_frame_t *frame, xlator_t *this)
+{
+        dht_conf_t  *conf  = frame->this->private;
+        dht_local_t *local = frame->local;
+        int          idx   = 0;
+
+        local->call_cnt = conf->subvolume_cnt;
+
+        local->fd = fd_create (local->loc.inode, frame->root->pid);
+        if (local->fd == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                DHT_STACK_UNWIND (frame, -1, ENOMEM, NULL);
+                return 0;
+        }
+
+        local->op_ret = 0;
+
+        if (!local->dst_cached) {
+                /* nothing at the destination to check */
+                dht_rename_dir_do (frame, this);
+                return 0;
+        }
+
+        for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+                STACK_WIND (frame, dht_rename_opendir_cbk,
+                            conf->subvolumes[idx],
+                            conf->subvolumes[idx]->fops->opendir,
+                            &local->loc2, local->fd);
+        }
+
+        return 0;
+}
+
+
+/*
+ * unlink callback of the clean-up stage of a file rename.  Failures are
+ * only logged; the last reply unwinds the frame with the result recorded
+ * in local.
+ */
+int
+dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno)
+{
+        dht_local_t  *local   = frame->local;
+        call_frame_t *prev    = cookie;
+        int           pending = 0;
+
+        pending = dht_frame_return (frame);
+
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "unlink on %s failed (%s)",
+                        prev->this->name, strerror (op_errno));
+        }
+
+        if (is_last_call (pending)) {
+                DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                                  &local->stbuf);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Callback of the actual rename of a file.
+ *
+ * On failure the error is recorded and the frame unwound.  On success the
+ * returned stat is merged into local->stbuf (so the caller receives real
+ * attributes rather than the zero-filled local), and the left-over copies
+ * of the old names are unlinked:
+ *   - the old source data file, unless it lives on dst_hashed/dst_cached
+ *   - the old source linkfile, unless it lives on the subvolume that
+ *     performed the rename or on src_cached
+ *   - the old destination data file, unless it lives on
+ *     dst_hashed/src_cached
+ * When nothing needs unlinking the frame is unwound directly; otherwise
+ * dht_rename_unlink_cbk() unwinds after the last unlink reply.
+ */
+int
+dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct stat *stbuf)
+{
+        dht_local_t  *local         = NULL;
+        call_frame_t *prev          = NULL;
+        xlator_t     *src_hashed    = NULL;
+        xlator_t     *src_cached    = NULL;
+        xlator_t     *dst_hashed    = NULL;
+        xlator_t     *dst_cached    = NULL;
+        xlator_t     *rename_subvol = NULL;
+
+        local = frame->local;
+        prev  = cookie;
+
+        src_hashed = local->src_hashed;
+        src_cached = local->src_cached;
+        dst_hashed = local->dst_hashed;
+        dst_cached = local->dst_cached;
+
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "rename on %s failed (%s)", prev->this->name,
+                        strerror (op_errno));
+                local->op_ret   = op_ret;
+                local->op_errno = op_errno;
+                goto unwind;
+        }
+
+        /* keep the attributes returned by the subvolume; nothing else on
+         * the file-rename path fills local->stbuf */
+        if (stbuf)
+                dht_stat_merge (this, &local->stbuf, stbuf, prev->this);
+
+        /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk
+         * is called. since rename has already happened on rename_subvol,
+         * unlink should not be sent for oldpath (either linkfile or cached-file)
+         * on rename_subvol. */
+        if (src_cached == dst_cached)
+                rename_subvol = src_cached;
+        else
+                rename_subvol = dst_hashed;
+
+        /* TODO: delete files in background */
+
+        /* count the unlinks first so call_cnt is final before any wind */
+        if (src_cached != dst_hashed && src_cached != dst_cached)
+                local->call_cnt++;
+
+        if (src_hashed != rename_subvol && src_hashed != src_cached)
+                local->call_cnt++;
+
+        if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached)
+                local->call_cnt++;
+
+        if (local->call_cnt == 0)
+                goto unwind;
+
+        if (src_cached != dst_hashed && src_cached != dst_cached) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "deleting old src datafile %s @ %s",
+                        local->loc.path, src_cached->name);
+
+                STACK_WIND (frame, dht_rename_unlink_cbk,
+                            src_cached, src_cached->fops->unlink,
+                            &local->loc);
+        }
+
+        if (src_hashed != rename_subvol && src_hashed != src_cached) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "deleting old src linkfile %s @ %s",
+                        local->loc.path, src_hashed->name);
+
+                STACK_WIND (frame, dht_rename_unlink_cbk,
+                            src_hashed, src_hashed->fops->unlink,
+                            &local->loc);
+        }
+
+        if (dst_cached
+            && (dst_cached != dst_hashed)
+            && (dst_cached != src_cached)) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "deleting old dst datafile %s @ %s",
+                        local->loc2.path, dst_cached->name);
+
+                STACK_WIND (frame, dht_rename_unlink_cbk,
+                            dst_cached, dst_cached->fops->unlink,
+                            &local->loc2);
+        }
+        return 0;
+
+unwind:
+        DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                          &local->stbuf);
+
+        return 0;
+}
+
+
+/*
+ * Wind the actual rename (loc -> loc2) for a file.  The rename is sent
+ * to src_cached when source and destination share the cached subvolume,
+ * otherwise to dst_hashed.  The reply is handled by dht_rename_cbk().
+ * Always returns 0.
+ */
+int
+dht_do_rename (call_frame_t *frame)
+{
+        dht_local_t *local  = frame->local;
+        xlator_t    *this   = frame->this;
+        xlator_t    *target = NULL;
+
+        target = (local->src_cached == local->dst_cached)
+                ? local->src_cached
+                : local->dst_hashed;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "renaming %s => %s (%s)",
+                local->loc.path, local->loc2.path, target->name);
+
+        STACK_WIND (frame, dht_rename_cbk,
+                    target, target->fops->rename,
+                    &local->loc, &local->loc2);
+
+        return 0;
+}
+
+/*
+ * Callback for the link/linkfile operations that precede a file rename.
+ * A failure is recorded in local.  After the last reply, the rename is
+ * started when no failure was recorded; otherwise the frame is unwound
+ * with the recorded error.
+ */
+int
+dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno,
+                      inode_t *inode, struct stat *stbuf)
+{
+        dht_local_t  *local   = frame->local;
+        call_frame_t *prev    = cookie;
+        int           pending = 0;
+
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "link/file on %s failed (%s)",
+                        prev->this->name, strerror (op_errno));
+                local->op_ret   = -1;
+                local->op_errno = op_errno;
+        }
+
+        pending = dht_frame_return (frame);
+        if (is_last_call (pending)) {
+                if (local->op_ret == -1) {
+                        DHT_STACK_UNWIND (frame, local->op_ret,
+                                          local->op_errno, &local->stbuf);
+                } else {
+                        dht_do_rename (frame);
+                }
+        }
+
+        return 0;
+}
+
+
+/*
+ * First stage of a file rename: create whatever must exist before the
+ * rename itself can be sent.
+ *   - a linkfile for the source on dst_hashed pointing at src_cached,
+ *     when dst_hashed is neither src_hashed nor src_cached
+ *   - a hard link loc -> loc2 on src_cached, when src_cached is not
+ *     dst_hashed
+ * Nothing is created when source and destination share the cached
+ * subvolume.  When no operation is needed the rename starts immediately;
+ * otherwise dht_rename_links_cbk() continues after the last reply.
+ * Always returns 0.
+ */
+int
+dht_rename_create_links (call_frame_t *frame)
+{
+        dht_local_t *local         = frame->local;
+        xlator_t    *this          = frame->this;
+        xlator_t    *src_hashed    = local->src_hashed;
+        xlator_t    *src_cached    = local->src_cached;
+        xlator_t    *dst_hashed    = local->dst_hashed;
+        xlator_t    *dst_cached    = local->dst_cached;
+        int          need_linkfile = 0;
+        int          need_link     = 0;
+        int          call_cnt      = 0;
+
+        if (src_cached != dst_cached) {
+                need_linkfile = (dst_hashed != src_hashed
+                                 && dst_hashed != src_cached);
+                need_link     = (src_cached != dst_hashed);
+
+                if (need_linkfile)
+                        call_cnt++;
+
+                if (need_link)
+                        call_cnt++;
+
+                /* must be final before the first operation is started */
+                local->call_cnt = call_cnt;
+
+                if (need_linkfile) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "linkfile %s @ %s => %s",
+                                local->loc.path, dst_hashed->name,
+                                src_cached->name);
+                        dht_linkfile_create (frame, dht_rename_links_cbk,
+                                             src_cached, dst_hashed,
+                                             &local->loc);
+                }
+
+                if (need_link) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "link %s => %s (%s)", local->loc.path,
+                                local->loc2.path, src_cached->name);
+                        STACK_WIND (frame, dht_rename_links_cbk,
+                                    src_cached, src_cached->fops->link,
+                                    &local->loc, &local->loc2);
+                }
+        }
+
+        if (!call_cnt) {
+                /* skip to next step */
+                dht_do_rename (frame);
+        }
+
+        return 0;
+}
+
+
+/*
+ * rename fop of the DHT translator.
+ *
+ * Resolves the hashed and cached subvolumes of both OLDLOC and NEWLOC
+ * (the destination's cached subvolume only when NEWLOC has an inode),
+ * stores them with copies of both locs in a fresh local, and then hands
+ * over to dht_rename_dir() for directories or dht_rename_create_links()
+ * for everything else.  Any set-up failure unwinds the frame with -1.
+ * Always returns 0.
+ */
+int
+dht_rename (call_frame_t *frame, xlator_t *this,
+            loc_t *oldloc, loc_t *newloc)
+{
+        xlator_t    *src_cached = NULL;
+        xlator_t    *src_hashed = NULL;
+        xlator_t    *dst_cached = NULL;
+        xlator_t    *dst_hashed = NULL;
+        dht_local_t *local      = NULL;
+        int          op_errno   = -1;
+
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (oldloc, err);
+        VALIDATE_OR_GOTO (newloc, err);
+
+        /* source side */
+        src_hashed = dht_subvol_get_hashed (this, oldloc);
+        if (src_hashed == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no subvolume in layout for path=%s",
+                        oldloc->path);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        src_cached = dht_subvol_get_cached (this, oldloc->inode);
+        if (src_cached == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for path=%s", oldloc->path);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        /* destination side */
+        dst_hashed = dht_subvol_get_hashed (this, newloc);
+        if (dst_hashed == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no subvolume in layout for path=%s",
+                        newloc->path);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        if (newloc->inode)
+                dst_cached = dht_subvol_get_cached (this, newloc->inode);
+
+        local = dht_local_init (frame);
+        if (local == NULL) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        /* the second copy is attempted only when the first succeeded */
+        if (loc_copy (&local->loc, oldloc) == -1
+            || loc_copy (&local->loc2, newloc) == -1) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        local->src_hashed = src_hashed;
+        local->src_cached = src_cached;
+        local->dst_hashed = dst_hashed;
+        local->dst_cached = dst_cached;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "renaming %s (hash=%s/cache=%s) => %s (hash=%s/cache=%s)",
+                oldloc->path, src_hashed->name, src_cached->name,
+                newloc->path, dst_hashed->name,
+                dst_cached ? dst_cached->name : "<nul>");
+
+        if (S_ISDIR (oldloc->inode->st_mode)) {
+                dht_rename_dir (frame, this);
+                return 0;
+        }
+
+        local->op_ret = 0;
+        dht_rename_create_links (frame);
+
+        return 0;
+
+err:
+        op_errno = (op_errno == -1) ? errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
new file mode 100644
index 000000000..ee32b2253
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -0,0 +1,460 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "dht-common.h"
+
+
+ /*
+  * Report the outcome of a directory self-heal to its initiator by invoking
+  * the completion callback stored in local->selfheal.dir_cbk, with @ret as
+  * the result and local->op_errno as the error number.  Always returns 0.
+  */
+ int
+ dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret)
+ {
+         dht_local_t *heal_local = frame->local;
+
+         /* the callback is handed frame->this rather than the @this argument */
+         heal_local->selfheal.dir_cbk (frame, NULL, frame->this, ret,
+                                       heal_local->op_errno);
+
+         return 0;
+ }
+
+
+ /*
+  * Reply handler for the setxattr wound by
+  * dht_selfheal_dir_xattr_persubvol().
+  *
+  * @cookie is read as the call frame of the replying subvolume.  The result
+  * (0 on success, op_errno otherwise) is stored in the first layout entry
+  * belonging to that subvolume; once the last outstanding reply has arrived
+  * the self-heal is finished with result 0.  Always returns 0.
+  */
+ int
+ dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                             int op_ret, int op_errno)
+ {
+         dht_local_t  *local   = frame->local;
+         dht_layout_t *layout  = local->selfheal.layout;
+         call_frame_t *prev    = cookie;
+         xlator_t     *replier = prev->this;
+         int           status  = (op_ret == 0) ? 0 : op_errno;
+         int           idx     = 0;
+         int           pending = 0;
+
+         /* locate the slot owned by the replying subvolume, if any */
+         while ((idx < layout->cnt) && (layout->list[idx].xlator != replier))
+                 idx++;
+
+         if (idx < layout->cnt)
+                 layout->list[idx].err = status;
+
+         /* keep the counter in a variable: is_last_call is a macro */
+         pending = dht_frame_return (frame);
+         if (is_last_call (pending)) {
+                 dht_selfheal_dir_finish (frame, this, 0);
+         }
+
+         return 0;
+ }
+
+
+ /*
+  * Write the on-disk layout of entry @i of @layout to its subvolume, as the
+  * "trusted.glusterfs.dht" extended attribute of @loc.
+  *
+  * On success a setxattr is wound to the subvolume and the reply is handled
+  * by dht_selfheal_dir_xattr_cbk().  If the request cannot be built, entry
+  * @i is marked with ENOMEM and the accounting for the call that was never
+  * wound is done here.  Always returns 0.
+  */
+ int
+ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
+                                   dht_layout_t *layout, int i)
+ {
+         xlator_t *subvol        = NULL;
+         dict_t   *xattr         = NULL;
+         int       ret           = 0;
+         xlator_t *this          = NULL;
+         int32_t  *disk_layout   = NULL;
+         int       this_call_cnt = 0;
+
+
+         subvol = layout->list[i].xlator;
+         this = frame->this;
+
+         xattr = get_new_dict ();
+         if (!xattr) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "memory allocation failed :(");
+                 goto err;
+         }
+
+         ret = dht_disk_layout_extract (this, layout, i, &disk_layout);
+         if (ret == -1) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "failed to extract disk layout");
+                 goto err;
+         }
+
+         ret = dict_set_bin (xattr, "trusted.glusterfs.dht",
+                             disk_layout, 4 * 4);
+         if (ret == -1) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "failed to set xattr dictionary");
+                 goto err;
+         }
+         /* the buffer is now reachable through the dictionary; do not free it */
+         disk_layout = NULL;
+
+         gf_log (this->name, GF_LOG_DEBUG,
+                 "setting hash range %u - %u (type %d) on subvolume %s for %s",
+                 layout->list[i].start, layout->list[i].stop,
+                 layout->type, subvol->name, loc->path);
+
+         /* hold a reference across the wind, drop it once the call is out */
+         dict_ref (xattr);
+
+         STACK_WIND (frame, dht_selfheal_dir_xattr_cbk,
+                     subvol, subvol->fops->setxattr,
+                     loc, xattr, 0);
+
+         dict_unref (xattr);
+
+         return 0;
+
+ err:
+         if (xattr) {
+                 dict_destroy (xattr);
+         }
+
+         if (disk_layout) {
+                 FREE (disk_layout);
+         }
+
+         /*
+          * Do not route the failure through dht_selfheal_dir_xattr_cbk():
+          * that callback reads its cookie as a call frame and dereferences
+          * ->this, so handing it the subvolume pointer (as this path used
+          * to do) makes it inspect the wrong object.  Record the failure on
+          * entry @i and account for the missing reply directly instead.
+          * NOTE(review): assumes @layout is the same object as
+          * local->selfheal.layout, which is what the callback would update.
+          */
+         layout->list[i].err = ENOMEM;
+
+         this_call_cnt = dht_frame_return (frame);
+         if (is_last_call (this_call_cnt)) {
+                 dht_selfheal_dir_finish (frame, this, 0);
+         }
+
+         return 0;
+ }
+
+
+ /*
+  * Write the layout xattr on every subvolume whose layout entry has
+  * err == -1 and a non-zero stop value.
+  *
+  * If no entry qualifies the self-heal is finished immediately with result
+  * 0.  Otherwise local->call_cnt is set to the number of qualifying entries
+  * and one request per entry is issued.  Always returns 0.
+  */
+ int
+ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
+ {
+         dht_local_t *local   = frame->local;
+         xlator_t    *this    = frame->this;
+         int          pending = 0;
+         int          idx     = 0;
+
+         /* attr missing and layout present */
+         for (idx = 0; idx < layout->cnt; idx++) {
+                 if ((layout->list[idx].err == -1) && layout->list[idx].stop)
+                         pending++;
+         }
+
+         gf_log (this->name, GF_LOG_DEBUG,
+                 "%d subvolumes missing xattr for %s",
+                 pending, loc->path);
+
+         if (pending == 0) {
+                 dht_selfheal_dir_finish (frame, this, 0);
+                 return 0;
+         }
+
+         local->call_cnt = pending;
+
+         for (idx = 0; idx < layout->cnt; idx++) {
+                 if ((layout->list[idx].err == -1) && layout->list[idx].stop) {
+                         dht_selfheal_dir_xattr_persubvol (frame, loc,
+                                                           layout, idx);
+
+                         /* leave without re-reading layout after the last one */
+                         if (--pending == 0)
+                                 break;
+                 }
+         }
+
+         return 0;
+ }
+
+
+ /*
+  * Reply handler for the mkdir wound by dht_selfheal_dir_mkdir().
+  *
+  * When the directory was created, or already existed (EEXIST), the first
+  * layout entry of the replying subvolume gets err = -1.  After the last
+  * outstanding reply the xattr phase is started on local->loc.
+  * Always returns 0.
+  */
+ int
+ dht_selfheal_dir_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                             int op_ret, int op_errno,
+                             inode_t *inode, struct stat *stbuf)
+ {
+         dht_local_t  *local   = frame->local;
+         dht_layout_t *layout  = local->selfheal.layout;
+         call_frame_t *prev    = cookie;
+         xlator_t     *replier = prev->this;
+         int           idx     = 0;
+         int           pending = 0;
+
+         if ((op_ret != 0) && (op_errno != EEXIST))
+                 goto account;
+
+         for (idx = 0; idx < layout->cnt; idx++) {
+                 if (layout->list[idx].xlator != replier)
+                         continue;
+
+                 layout->list[idx].err = -1;
+                 break;
+         }
+
+ account:
+         pending = dht_frame_return (frame);
+         if (is_last_call (pending)) {
+                 dht_selfheal_dir_xattr (frame, &local->loc, layout);
+         }
+
+         return 0;
+ }
+
+
+ /*
+  * Create the directory @loc on every subvolume whose layout entry reports
+  * ENOENT, or on every subvolume when @force is non-zero, using the mode
+  * kept in local->stbuf.
+  *
+  * If nothing needs creating the xattr phase is started right away.
+  * Otherwise local->call_cnt is set to the number of mkdirs and the replies
+  * are collected by dht_selfheal_dir_mkdir_cbk().  Always returns 0.
+  */
+ int
+ dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc,
+                         dht_layout_t *layout, int force)
+ {
+         int           missing_dirs = 0;
+         int           i            = 0;
+         dht_local_t  *local        = NULL;
+         xlator_t     *this         = NULL;
+
+
+         local = frame->local;
+         this = frame->this;
+
+         for (i = 0; i < layout->cnt; i++) {
+                 if (layout->list[i].err == ENOENT || force)
+                         missing_dirs++;
+         }
+
+         if (missing_dirs == 0) {
+                 dht_selfheal_dir_xattr (frame, loc, layout);
+                 return 0;
+         }
+
+         local->call_cnt = missing_dirs;
+         for (i = 0; i < layout->cnt; i++) {
+                 if (layout->list[i].err == ENOENT || force) {
+                         gf_log (this->name, GF_LOG_DEBUG,
+                                 "creating directory %s on subvol %s",
+                                 loc->path, layout->list[i].xlator->name);
+
+                         STACK_WIND (frame, dht_selfheal_dir_mkdir_cbk,
+                                     layout->list[i].xlator,
+                                     layout->list[i].xlator->fops->mkdir,
+                                     loc, local->stbuf.st_mode);
+
+                         /*
+                          * Same guard as the wind loop in
+                          * dht_selfheal_dir_xattr(): once the last request is
+                          * out, the replies may already have completed the
+                          * whole operation, so stop without reading @layout
+                          * again.  No further entry could match anyway.
+                          */
+                         if (--missing_dirs == 0)
+                                 break;
+                 }
+         }
+
+         return 0;
+ }
+
+ /*
+  * Split the 32-bit hash space evenly among the layout entries whose err is
+  * -1, assigning consecutive [start, stop] ranges in list order.  The last
+  * such entry is extended to 0xffffffff so the division remainder is not
+  * left uncovered.
+  *
+  * If no entry has err == -1 there is nothing to assign and the layout is
+  * left untouched (previously this divided by zero).
+  */
+ void
+ dht_selfheal_fix_this_virgin (call_frame_t *frame, loc_t *loc,
+                               dht_layout_t *layout)
+ {
+         xlator_t *this  = NULL;
+         uint32_t  chunk = 0;
+         int       i     = 0;
+         uint32_t  start = 0;
+         int       cnt   = 0;
+         int       err   = 0;
+
+         this = frame->this;
+
+         for (i = 0; i < layout->cnt; i++) {
+                 err = layout->list[i].err;
+                 if (err == -1) {
+                         cnt++;
+                 }
+         }
+
+         if (cnt == 0) {
+                 /* e.g. every entry carries a real error such as ENOTCONN */
+                 gf_log (this->name, GF_LOG_WARNING,
+                         "no subvolume eligible for a hash range on %s",
+                         loc->path);
+                 return;
+         }
+
+         chunk = ((unsigned long) 0xffffffff) / cnt;
+
+         start = 0;
+         for (i = 0; i < layout->cnt; i++) {
+                 err = layout->list[i].err;
+                 if (err == -1) {
+                         layout->list[i].start = start;
+                         layout->list[i].stop  = start + chunk - 1;
+
+                         start = start + chunk;
+
+                         gf_log (this->name, GF_LOG_DEBUG,
+                                 "gave fix: %u - %u on %s for %s",
+                                 layout->list[i].start, layout->list[i].stop,
+                                 layout->list[i].xlator->name, loc->path);
+                         if (--cnt == 0) {
+                                 /* last range absorbs the remainder */
+                                 layout->list[i].stop = 0xffffffff;
+                                 break;
+                         }
+                 }
+         }
+ }
+
+
+ /*
+  * Decide whether the directory layout can be healed.
+  *
+  * Returns 0 when any of the following holds, -1 otherwise:
+  *   - missing + down equals the number of subvolumes (fresh ranges are
+  *     then assigned via dht_selfheal_fix_this_virgin());
+  *   - the number of holes does not exceed the number of down subvolumes;
+  *   - at least one layout entry reports ENOENT.
+  */
+ int
+ dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc,
+                           dht_layout_t *layout)
+ {
+         xlator_t    *this    = frame->this;
+         dht_conf_t  *conf    = this->private;
+         dht_local_t *local   = frame->local;
+         int          missing = local->selfheal.missing;
+         int          down    = local->selfheal.down;
+         int          holes   = local->selfheal.hole_cnt;
+         int          fixable = -1;
+         int          idx     = 0;
+
+         if ((missing + down) == conf->subvolume_cnt) {
+                 dht_selfheal_fix_this_virgin (frame, loc, layout);
+                 fixable = 0;
+         }
+
+         /* the down subvol might fill up the holes */
+         if (holes <= down)
+                 fixable = 0;
+
+         /* directory not present */
+         idx = 0;
+         while (idx < layout->cnt) {
+                 if (layout->list[idx].err == ENOENT) {
+                         fixable = 0;
+                         break;
+                 }
+                 idx++;
+         }
+
+         /* TODO: give a fix to these non-virgins */
+
+         return fixable;
+ }
+
+
+ /*
+  * Entry point of directory self-heal.
+  *
+  * Collects the layout anomaly counters into local->selfheal, remembers
+  * @dir_cbk and @layout there, and asks dht_selfheal_dir_getafix() whether
+  * the layout is fixable.  If so the mkdir phase is started (force = 0);
+  * otherwise the self-heal is finished with result -1.  Always returns 0.
+  */
+ int
+ dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
+                         loc_t *loc, dht_layout_t *layout)
+ {
+         dht_local_t *local   = frame->local;
+         xlator_t    *this    = frame->this;
+         int          fix_ret = 0;
+
+         dht_layout_anomalies (this, loc, layout,
+                               &local->selfheal.hole_cnt,
+                               &local->selfheal.overlaps_cnt,
+                               &local->selfheal.missing,
+                               &local->selfheal.down,
+                               &local->selfheal.misc);
+
+         local->selfheal.dir_cbk = dir_cbk;
+         local->selfheal.layout  = layout;
+
+ /*
+    Disabled checks, kept for reference.  holes, overlaps, missing, down and
+    misc stand for the matching local->selfheal counters filled in above.
+
+         if (down) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "%d subvolumes down -- not fixing", down);
+                 ret = 0;
+                 goto sorry_no_fix;
+         }
+
+         if (overlaps) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "not fixing overlaps in %s", loc->path);
+                 local->op_errno = EINVAL;
+                 ret = -1;
+                 goto sorry_no_fix;
+         }
+
+         if (misc) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "%d subvolumes have unrecoverable errors", misc);
+                 ret = 0;
+                 goto sorry_no_fix;
+         }
+
+         if (holes > missing) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "%d holes and %d pigeons -- not fixing",
+                         holes, missing);
+                 ret = 0;
+                 goto sorry_no_fix;
+         }
+ */
+         fix_ret = dht_selfheal_dir_getafix (frame, loc, layout);
+         if (fix_ret != -1) {
+                 dht_selfheal_dir_mkdir (frame, loc, layout, 0);
+                 return 0;
+         }
+
+         gf_log (this->name, GF_LOG_ERROR,
+                 "the directory is not a virgin");
+
+         /* TODO: need to put appropriate local->op_errno */
+         dht_selfheal_dir_finish (frame, this, fix_ret);
+
+         return 0;
+ }
+
+
+ /*
+  * Re-create the directory @loc on every subvolume of @layout: stores
+  * @dir_cbk and @layout in local->selfheal and starts the mkdir phase with
+  * force = 1.  Always returns 0.
+  */
+ int
+ dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
+                       loc_t *loc, dht_layout_t *layout)
+ {
+         dht_local_t *heal_local = frame->local;
+
+         heal_local->selfheal.dir_cbk = dir_cbk;
+         heal_local->selfheal.layout  = layout;
+
+         dht_selfheal_dir_mkdir (frame, loc, layout, 1);
+
+         return 0;
+ }
diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c
new file mode 100644
index 000000000..836e7a4e8
--- /dev/null
+++ b/xlators/cluster/dht/src/dht.c
@@ -0,0 +1,222 @@
+/*
+ Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+/* TODO: add NS locking */
+
+#include "dht-common.c"
+
+/* TODO:
+ - use volumename in xattr instead of "dht"
+ - use NS locks
+ - handle all cases in self heal layout reconstruction
+ - complete linkfile selfheal
+*/
+
+
+
+ /*
+  * Translator notify entry point: forwards the event and its data to
+  * dht_notify() and returns that function's result.
+  */
+ int
+ notify (xlator_t *this, int event, void *data, ...)
+ {
+         return dht_notify (this, event, data);
+ }
+
+ /*
+  * Translator teardown: releases the configuration hanging off
+  * this->private (per-subvolume file layouts, default directory layout,
+  * subvolume array and status array) and then the configuration itself.
+  * Does nothing when no configuration is attached.
+  */
+ void
+ fini (xlator_t *this)
+ {
+         dht_conf_t *conf = this->private;
+         int         idx  = 0;
+
+         if (!conf)
+                 return;
+
+         if (conf->file_layouts) {
+                 for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+                         FREE (conf->file_layouts[idx]);
+                 }
+                 FREE (conf->file_layouts);
+         }
+
+         if (conf->default_dir_layout) {
+                 FREE (conf->default_dir_layout);
+         }
+
+         if (conf->subvolumes) {
+                 FREE (conf->subvolumes);
+         }
+
+         if (conf->subvolume_status) {
+                 FREE (conf->subvolume_status);
+         }
+
+         FREE (conf);
+ }
+
+ /*
+  * Translator setup for dht.
+  *
+  * Requires at least one child; warns when the volume has no parent.
+  * Allocates the configuration, reads the optional "lookup-unhashed"
+  * boolean, initialises the subvolume tables and layouts, and publishes the
+  * configuration in this->private.
+  *
+  * Returns 0 on success and -1 on failure, in which case everything
+  * allocated so far has been released.
+  */
+ int
+ init (xlator_t *this)
+ {
+         dht_conf_t *conf         = NULL;
+         char       *unhashed_opt = NULL;
+         int         idx          = 0;
+
+         if (!this->children) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "DHT needs more than one child defined");
+                 return -1;
+         }
+
+         if (!this->parents) {
+                 gf_log (this->name, GF_LOG_WARNING,
+                         "dangling volume. check volfile ");
+         }
+
+         conf = CALLOC (1, sizeof (*conf));
+         if (!conf) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "memory allocation failed :(");
+                 goto err;
+         }
+
+         conf->search_unhashed = 0;
+
+         if (dict_get_str (this->options, "lookup-unhashed",
+                           &unhashed_opt) == 0) {
+                 gf_string2boolean (unhashed_opt, &conf->search_unhashed);
+         }
+
+         if (dht_init_subvolumes (this, conf) == -1)
+                 goto err;
+
+         if (dht_layouts_init (this, conf) == -1)
+                 goto err;
+
+         LOCK_INIT (&conf->subvolume_lock);
+
+         conf->gen = 1;
+
+         this->private = conf;
+
+         return 0;
+
+ err:
+         if (!conf)
+                 return -1;
+
+         if (conf->file_layouts) {
+                 for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+                         FREE (conf->file_layouts[idx]);
+                 }
+                 FREE (conf->file_layouts);
+         }
+
+         if (conf->default_dir_layout) {
+                 FREE (conf->default_dir_layout);
+         }
+
+         if (conf->subvolumes) {
+                 FREE (conf->subvolumes);
+         }
+
+         if (conf->subvolume_status) {
+                 FREE (conf->subvolume_status);
+         }
+
+         FREE (conf);
+
+         return -1;
+ }
+
+ /*
+  * File-operation table of the dht translator: each listed operation is
+  * bound to its dht_* handler.  The setdents/getdents/checksum entries are
+  * compiled out with #if 0.
+  */
+ struct xlator_fops fops = {
+ .lookup = dht_lookup,
+ .mknod = dht_mknod,
+ .create = dht_create,
+
+ .stat = dht_stat,
+ .chmod = dht_chmod,
+ .chown = dht_chown,
+ .fchown = dht_fchown,
+ .fchmod = dht_fchmod,
+ .fstat = dht_fstat,
+ .utimens = dht_utimens,
+ .truncate = dht_truncate,
+ .ftruncate = dht_ftruncate,
+ .access = dht_access,
+ .readlink = dht_readlink,
+ .setxattr = dht_setxattr,
+ .getxattr = dht_getxattr,
+ .removexattr = dht_removexattr,
+ .open = dht_open,
+ .readv = dht_readv,
+ .writev = dht_writev,
+ .flush = dht_flush,
+ .fsync = dht_fsync,
+ .statfs = dht_statfs,
+ .lk = dht_lk,
+ .opendir = dht_opendir,
+ .readdir = dht_readdir,
+ .fsyncdir = dht_fsyncdir,
+ .symlink = dht_symlink,
+ .unlink = dht_unlink,
+ .link = dht_link,
+ .mkdir = dht_mkdir,
+ .rmdir = dht_rmdir,
+ .rename = dht_rename,
+ .inodelk = dht_inodelk,
+ .finodelk = dht_finodelk,
+ .entrylk = dht_entrylk,
+ .fentrylk = dht_fentrylk,
+ .xattrop = dht_xattrop,
+ .fxattrop = dht_fxattrop,
+ #if 0
+ .setdents = dht_setdents,
+ .getdents = dht_getdents,
+ .checksum = dht_checksum,
+ #endif
+ };
+
+ /* The xlator_mops table is intentionally left empty for dht. */
+ struct xlator_mops mops = {
+ };
+
+ /*
+  * Callback table of the dht translator: only forget is wired up (to
+  * dht_forget); the release and releasedir entries are commented out.
+  */
+ struct xlator_cbks cbks = {
+ // .release = dht_release,
+ // .releasedir = dht_releasedir,
+ .forget = dht_forget
+ };
+
+ /*
+  * Options accepted by the dht translator: "lookup-unhashed" is a boolean
+  * (read in init() into conf->search_unhashed).  The entry with a NULL key
+  * ends the list.
+  */
+ struct volume_options options[] = {
+ { .key = {"lookup-unhashed"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {NULL} },
+ };
diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c
new file mode 100644
index 000000000..6333e002f
--- /dev/null
+++ b/xlators/cluster/dht/src/nufa.c
@@ -0,0 +1,684 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "dht-common.c"
+
+ /* TODO: all the TODOs listed in dht.c apply here as well */
+ /*
+  * Reply handler for the lookup that nufa_lookup() winds to
+  * conf->local_volume for a fresh (non-revalidate) lookup.
+  *
+  * Depending on the reply it either:
+  *   - starts dht_lookup_everywhere() (entry missing and search_unhashed
+  *     set, or a linkfile without a target subvolume);
+  *   - looks the directory up on every subvolume (directory found);
+  *   - follows the linkfile to its target subvolume (linkfile found);
+  *   - winds a lookup to local->hashed_subvol (the "out" path); or
+  *   - unwinds to the caller (the "err" path).
+  * Always returns 0.
+  */
+ int
+ nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ inode_t *inode, struct stat *stbuf, dict_t *xattr)
+ {
+ dht_layout_t *layout = NULL;
+ xlator_t *subvol = NULL;
+ char is_linkfile = 0;
+ char is_dir = 0;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ loc_t *loc = NULL;
+ int i = 0;
+ call_frame_t *prev = NULL;
+ int call_cnt = 0;
+
+
+ conf = this->private;
+
+ prev = cookie; /* frame of the subvolume that replied */
+ local = frame->local;
+ loc = &local->loc;
+
+ if (ENTRY_MISSING (op_ret, op_errno)) {
+ if (conf->search_unhashed) {
+ local->op_errno = ENOENT;
+ dht_lookup_everywhere (frame, this, loc);
+ return 0;
+ }
+ }
+
+ if (op_ret == -1) /* local lookup failed: try the hashed subvolume */
+ goto out;
+
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_dir = check_is_dir (inode, stbuf, xattr);
+
+ if (!is_dir && !is_linkfile) {
+ /* non-directory and not a linkfile */
+
+ dht_itransform (this, prev->this, stbuf->st_ino,
+ &stbuf->st_ino);
+
+ layout = dht_layout_for_subvol (this, prev->this);
+ if (!layout) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no pre-set layout for subvolume %s",
+ prev->this->name);
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ inode_ctx_put (inode, this, (uint64_t)(long)layout);
+ goto out; /* NOTE(review): a file found locally still goes through the hashed-subvolume lookup below -- confirm this is intended */
+ }
+
+ if (is_dir) {
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
+
+ local->inode = inode_ref (inode);
+ local->xattr = dict_ref (xattr);
+
+ local->op_ret = 0;
+ local->op_errno = 0;
+
+ local->layout = dht_layout_new (this, conf->subvolume_cnt);
+ if (!local->layout) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ goto err;
+ }
+
+ for (i = 0; i < call_cnt; i++) { /* bounded by the local copy of the count */
+ STACK_WIND (frame, dht_lookup_dir_cbk,
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup,
+ &local->loc, local->xattr_req);
+ }
+ }
+
+ if (is_linkfile) {
+ subvol = dht_linkfile_subvol (this, inode, stbuf, xattr);
+
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "linkfile not having link subvolume. path=%s",
+ loc->path);
+ dht_lookup_everywhere (frame, this, loc);
+ return 0;
+ }
+
+ STACK_WIND (frame, dht_lookup_linkfile_cbk,
+ subvol, subvol->fops->lookup,
+ &local->loc, local->xattr_req);
+ }
+
+ return 0;
+
+ out:
+ if (!local->hashed_subvol) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "no subvolume in layout for path=%s",
+ local->loc.path);
+ op_errno = EINVAL; /* NOTE(review): op_ret is not reset here, so this can unwind with op_ret == 0 and op_errno == EINVAL -- confirm */
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_lookup_cbk,
+ local->hashed_subvol, local->hashed_subvol->fops->lookup,
+ &local->loc, local->xattr_req);
+
+ return 0;
+
+ err:
+ DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr);
+ return 0;
+ }
+
+ /*
+  * Lookup entry point of the nufa translator.
+  *
+  * Revalidate: the lookup is sent to every subvolume of the inode's cached
+  * layout and collected by dht_revalidate_cbk().  Fails with EINVAL when no
+  * layout is cached and with EAGAIN when the layout generation is older
+  * than the configuration's.
+  *
+  * Fresh lookup: the request goes to the local volume only and is handled
+  * by nufa_local_lookup_cbk().
+  *
+  * Both paths ask for the "trusted.glusterfs.dht" xattr; a fresh lookup
+  * also asks for "trusted.glusterfs.dht.linkto".  On any failure the call
+  * is unwound with -1 and the error number.  Always returns 0.
+  */
+ int
+ nufa_lookup (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, dict_t *xattr_req)
+ {
+         xlator_t     *hashed_subvol = NULL;
+         xlator_t     *cached_subvol = NULL;
+         xlator_t     *subvol = NULL;
+         dht_local_t  *local = NULL;
+         dht_conf_t   *conf = NULL;
+         int           ret = -1;
+         int           op_errno = -1;
+         dht_layout_t *layout = NULL;
+         int           i = 0;
+         int           call_cnt = 0;
+
+
+         VALIDATE_OR_GOTO (frame, err);
+         VALIDATE_OR_GOTO (this, err);
+         VALIDATE_OR_GOTO (loc, err);
+         VALIDATE_OR_GOTO (loc->inode, err);
+         VALIDATE_OR_GOTO (loc->path, err);
+
+         conf = this->private;
+
+         local = dht_local_init (frame);
+         if (!local) {
+                 op_errno = ENOMEM;
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "memory allocation failed :(");
+                 goto err;
+         }
+
+         ret = loc_dup (loc, &local->loc);
+         if (ret == -1) {
+                 op_errno = errno;
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "copying location failed for path=%s",
+                         loc->path);
+                 goto err;
+         }
+
+         if (xattr_req) {
+                 local->xattr_req = dict_ref (xattr_req);
+         } else {
+                 local->xattr_req = dict_new ();
+                 if (!local->xattr_req) {
+                         /* without a request dict the layout xattrs below
+                          * could not be asked for */
+                         op_errno = ENOMEM;
+                         gf_log (this->name, GF_LOG_ERROR,
+                                 "memory allocation failed :(");
+                         goto err;
+                 }
+         }
+
+         hashed_subvol = dht_subvol_get_hashed (this, &local->loc);
+         cached_subvol = dht_subvol_get_cached (this, local->loc.inode);
+
+         local->cached_subvol = cached_subvol;
+         local->hashed_subvol = hashed_subvol;
+
+         if (is_revalidate (loc)) {
+                 layout = dht_layout_get (this, loc->inode);
+
+                 if (!layout) {
+                         gf_log (this->name, GF_LOG_ERROR,
+                                 "revalidate without cache. path=%s",
+                                 loc->path);
+                         op_errno = EINVAL;
+                         goto err;
+                 }
+
+                 if (layout->gen && (layout->gen < conf->gen)) {
+                         gf_log (this->name, GF_LOG_WARNING,
+                                 "incomplete layout failure for path=%s",
+                                 loc->path);
+                         op_errno = EAGAIN;
+                         goto err;
+                 }
+
+                 local->inode = inode_ref (loc->inode);
+                 local->st_ino = loc->inode->ino;
+
+                 local->call_cnt = layout->cnt;
+                 call_cnt = local->call_cnt;
+
+                 /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute,
+                  * revalidates directly go to the cached-subvolume.
+                  */
+                 ret = dict_set_uint32 (local->xattr_req,
+                                        "trusted.glusterfs.dht", 4 * 4);
+
+                 for (i = 0; i < layout->cnt; i++) {
+                         subvol = layout->list[i].xlator;
+
+                         STACK_WIND (frame, dht_revalidate_cbk,
+                                     subvol, subvol->fops->lookup,
+                                     loc, local->xattr_req);
+
+                         /* stop on the local counter so the layout is not
+                          * read again after the last wind */
+                         if (!--call_cnt)
+                                 break;
+                 }
+         } else {
+                 ret = dict_set_uint32 (local->xattr_req,
+                                        "trusted.glusterfs.dht", 4 * 4);
+
+                 ret = dict_set_uint32 (local->xattr_req,
+                                        "trusted.glusterfs.dht.linkto", 256);
+
+                 /* Send it to only local volume */
+                 STACK_WIND (frame, nufa_local_lookup_cbk,
+                             conf->local_volume,
+                             conf->local_volume->fops->lookup,
+                             loc, local->xattr_req);
+         }
+
+         return 0;
+
+ err:
+         op_errno = (op_errno == -1) ? errno : op_errno;
+         DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);
+         return 0;
+ }
+
+ /*
+  * Reply handler for the linkfile creation started by nufa_create().
+  *
+  * If the linkfile could not be created (op_ret == -1) the create is
+  * unwound with -1 and op_errno.  Otherwise the real file is created on the
+  * local volume, using the loc, flags, mode and fd saved in the frame's
+  * local; dht_create_cbk() handles that reply.  Always returns 0.
+  */
+ int
+ nufa_create_linkfile_create_cbk (call_frame_t *frame, void *cookie,
+                                  xlator_t *this, int op_ret, int op_errno,
+                                  inode_t *inode, struct stat *stbuf)
+ {
+         dht_local_t *local = frame->local;
+         dht_conf_t  *conf  = this->private;
+
+         if (op_ret == -1) {
+                 DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);
+                 return 0;
+         }
+
+         STACK_WIND (frame, dht_create_cbk,
+                     conf->local_volume, conf->local_volume->fops->create,
+                     &local->loc, local->flags, local->mode, local->fd);
+
+         return 0;
+ }
+
+ /*
+  * Create entry point of the nufa translator.
+  *
+  * When the hashed subvolume of @loc is the local volume the file is
+  * created there directly.  Otherwise the request (loc, fd, mode, flags) is
+  * saved in the frame's local and a linkfile is created first;
+  * nufa_create_linkfile_create_cbk() then creates the file on the local
+  * volume.
+  *
+  * Failures are unwound with -1 and an error number (ENOMEM, ENOENT when no
+  * hashed subvolume exists, or errno).  Always returns 0.
+  */
+ int
+ nufa_create (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, int32_t flags, mode_t mode, fd_t *fd)
+ {
+         dht_local_t *local    = NULL;
+         dht_conf_t  *conf     = NULL;
+         xlator_t    *hashed   = NULL;
+         int          op_errno = -1;
+
+         VALIDATE_OR_GOTO (frame, err);
+         VALIDATE_OR_GOTO (this, err);
+         VALIDATE_OR_GOTO (loc, err);
+
+         conf = this->private;
+
+         local = dht_local_init (frame);
+         if (!local) {
+                 op_errno = ENOMEM;
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "memory allocation failed :(");
+                 goto err;
+         }
+
+         hashed = dht_subvol_get_hashed (this, loc);
+         if (!hashed) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "no subvolume in layout for path=%s",
+                         loc->path);
+                 op_errno = ENOENT;
+                 goto err;
+         }
+
+         if (hashed == conf->local_volume) {
+                 /* hashed location is local: plain create */
+                 gf_log (this->name, GF_LOG_DEBUG,
+                         "creating %s on %s", loc->path, hashed->name);
+
+                 STACK_WIND (frame, dht_create_cbk,
+                             hashed, hashed->fops->create,
+                             loc, flags, mode, fd);
+
+                 return 0;
+         }
+
+         /* create a link file instead of actual file */
+         if (loc_copy (&local->loc, loc) == -1) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "memory allocation failed :(");
+                 op_errno = ENOMEM;
+                 goto err;
+         }
+
+         local->fd    = fd_ref (fd);
+         local->mode  = mode;
+         local->flags = flags;
+
+         dht_linkfile_create (frame, nufa_create_linkfile_create_cbk,
+                              conf->local_volume, hashed, loc);
+
+         return 0;
+
+ err:
+         op_errno = (op_errno == -1) ? errno : op_errno;
+         DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);
+
+         return 0;
+ }
+
+ /*
+  * Reply handler for the linkfile creation started by nufa_mknod().
+  *
+  * A negative op_ret is passed straight back to the caller together with
+  * op_errno, inode and stbuf.  Otherwise the node is created on the local
+  * volume with the loc, mode and rdev saved in the frame's local;
+  * dht_newfile_cbk() handles that reply.  Always returns 0.
+  */
+ int
+ nufa_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                          int op_ret, int op_errno,
+                          inode_t *inode, struct stat *stbuf)
+ {
+         dht_local_t *local = frame->local;
+         dht_conf_t  *conf  = this->private;
+
+         if (op_ret < 0) {
+                 DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf);
+                 return 0;
+         }
+
+         STACK_WIND (frame, dht_newfile_cbk,
+                     conf->local_volume,
+                     conf->local_volume->fops->mknod,
+                     &local->loc, local->mode, local->rdev);
+
+         return 0;
+ }
+
+
+ /*
+  * Mknod entry point of the nufa translator.
+  *
+  * When the hashed subvolume of @loc is the local volume the node is
+  * created there directly.  Otherwise loc, mode and rdev are saved in the
+  * frame's local and a linkfile is created first;
+  * nufa_mknod_linkfile_cbk() then creates the node on the local volume.
+  *
+  * Failures are unwound with -1 and an error number (ENOMEM, ENOENT when no
+  * hashed subvolume exists, or errno).  Always returns 0.
+  */
+ int
+ nufa_mknod (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, mode_t mode, dev_t rdev)
+ {
+         dht_local_t *local    = NULL;
+         dht_conf_t  *conf     = NULL;
+         xlator_t    *hashed   = NULL;
+         int          op_errno = -1;
+
+         VALIDATE_OR_GOTO (frame, err);
+         VALIDATE_OR_GOTO (this, err);
+         VALIDATE_OR_GOTO (loc, err);
+
+         conf = this->private;
+
+         local = dht_local_init (frame);
+         if (!local) {
+                 op_errno = ENOMEM;
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "memory allocation failed :(");
+                 goto err;
+         }
+
+         hashed = dht_subvol_get_hashed (this, loc);
+         if (!hashed) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "no subvolume in layout for path=%s",
+                         loc->path);
+                 op_errno = ENOENT;
+                 goto err;
+         }
+
+         if (conf->local_volume == hashed) {
+                 /* hashed location is local: plain mknod */
+                 gf_log (this->name, GF_LOG_DEBUG,
+                         "creating %s on %s", loc->path, hashed->name);
+
+                 STACK_WIND (frame, dht_newfile_cbk,
+                             hashed, hashed->fops->mknod,
+                             loc, mode, rdev);
+
+                 return 0;
+         }
+
+         /* Create linkfile first */
+         if (loc_copy (&local->loc, loc) == -1) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "memory allocation failed :(");
+                 op_errno = ENOMEM;
+                 goto err;
+         }
+
+         local->mode = mode;
+         local->rdev = rdev;
+
+         dht_linkfile_create (frame, nufa_mknod_linkfile_cbk,
+                              conf->local_volume, hashed, loc);
+
+         return 0;
+
+ err:
+         op_errno = (op_errno == -1) ? errno : op_errno;
+         DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+         return 0;
+ }
+
+
+ /*
+  * Translator notify entry point: forwards the event and its data to
+  * dht_notify() and returns that function's result.
+  */
+ int
+ notify (xlator_t *this, int event, void *data, ...)
+ {
+         return dht_notify (this, event, data);
+ }
+
+ /*
+  * Translator teardown: releases the configuration hanging off
+  * this->private (per-subvolume file layouts, default directory layout,
+  * subvolume array and status array) and then the configuration itself.
+  * Does nothing when no configuration is attached.
+  */
+ void
+ fini (xlator_t *this)
+ {
+         dht_conf_t *conf = this->private;
+         int         idx  = 0;
+
+         if (!conf)
+                 return;
+
+         if (conf->file_layouts) {
+                 for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+                         FREE (conf->file_layouts[idx]);
+                 }
+                 FREE (conf->file_layouts);
+         }
+
+         if (conf->default_dir_layout) {
+                 FREE (conf->default_dir_layout);
+         }
+
+         if (conf->subvolumes) {
+                 FREE (conf->subvolumes);
+         }
+
+         if (conf->subvolume_status) {
+                 FREE (conf->subvolume_status);
+         }
+
+         FREE (conf);
+ }
+
+ /*
+  * Translator setup for nufa.
+  *
+  * Performs the same setup as the dht translator (configuration allocation,
+  * "lookup-unhashed" option, subvolume tables, layouts) and then picks the
+  * local volume: the child whose name equals the "local-volume-name" option
+  * if given, else this machine's hostname, else "localhost".
+  *
+  * Returns 0 on success.  Returns -1, after releasing everything allocated
+  * so far, when setup fails or no child carries the chosen name.
+  */
+ int
+ init (xlator_t *this)
+ {
+         dht_conf_t    *conf = NULL;
+         xlator_list_t *trav = NULL;
+         data_t        *data = NULL;
+         char          *local_volname = NULL;
+         char          *lookup_unhashed_str = NULL;
+         int            ret = -1;
+         int            i = 0;
+         char           my_hostname[256];
+
+         if (!this->children) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "DHT needs more than one child defined");
+                 return -1;
+         }
+
+         if (!this->parents) {
+                 gf_log (this->name, GF_LOG_WARNING,
+                         "dangling volume. check volfile ");
+         }
+
+         conf = CALLOC (1, sizeof (*conf));
+         if (!conf) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "memory allocation failed :(");
+                 goto err;
+         }
+
+         conf->search_unhashed = 0;
+
+         if (dict_get_str (this->options, "lookup-unhashed",
+                           &lookup_unhashed_str) == 0) {
+                 gf_string2boolean (lookup_unhashed_str,
+                                    &conf->search_unhashed);
+         }
+
+         ret = dht_init_subvolumes (this, conf);
+         if (ret == -1) {
+                 goto err;
+         }
+
+         ret = dht_layouts_init (this, conf);
+         if (ret == -1) {
+                 goto err;
+         }
+
+         LOCK_INIT (&conf->subvolume_lock);
+
+         conf->gen = 1;
+
+         local_volname = "localhost";
+         ret = gethostname (my_hostname, sizeof (my_hostname));
+         if (ret < 0) {
+                 gf_log (this->name, GF_LOG_WARNING,
+                         "could not find hostname (%s)",
+                         strerror (errno));
+         }
+
+         if (ret == 0) {
+                 /* gethostname() may leave a truncated name unterminated;
+                  * the name is handed to strcmp() below */
+                 my_hostname[sizeof (my_hostname) - 1] = '\0';
+                 local_volname = my_hostname;
+         }
+
+         /* an explicit option overrides the hostname */
+         data = dict_get (this->options, "local-volume-name");
+         if (data) {
+                 local_volname = data->data;
+         }
+
+         trav = this->children;
+         while (trav) {
+                 if (strcmp (trav->xlator->name, local_volname) == 0)
+                         break;
+                 trav = trav->next;
+         }
+
+         if (!trav) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "Could not find subvolume named '%s'. "
+                         "Please define volume with the name as the hostname "
+                         "or override it with 'option local-volume-name'",
+                         local_volname);
+                 goto err;
+         }
+         /* The volume specified exists */
+         conf->local_volume = trav->xlator;
+
+         this->private = conf;
+
+         return 0;
+
+ err:
+         if (conf) {
+                 if (conf->file_layouts) {
+                         for (i = 0; i < conf->subvolume_cnt; i++) {
+                                 FREE (conf->file_layouts[i]);
+                         }
+                         FREE (conf->file_layouts);
+                 }
+
+                 if (conf->default_dir_layout) {
+                         FREE (conf->default_dir_layout);
+                 }
+
+                 if (conf->subvolumes) {
+                         FREE (conf->subvolumes);
+                 }
+
+                 if (conf->subvolume_status) {
+                         FREE (conf->subvolume_status);
+                 }
+
+                 FREE (conf);
+         }
+
+         return -1;
+ }
+
+ /*
+  * File-operation table of the nufa translator: lookup, create and mknod
+  * use the nufa_* handlers defined in this file, every other listed
+  * operation is bound to its dht_* handler.  The setdents/getdents/checksum
+  * entries are compiled out with #if 0.
+  */
+ struct xlator_fops fops = {
+ .lookup = nufa_lookup,
+ .create = nufa_create,
+ .mknod = nufa_mknod,
+
+ .stat = dht_stat,
+ .chmod = dht_chmod,
+ .chown = dht_chown,
+ .fchown = dht_fchown,
+ .fchmod = dht_fchmod,
+ .fstat = dht_fstat,
+ .utimens = dht_utimens,
+ .truncate = dht_truncate,
+ .ftruncate = dht_ftruncate,
+ .access = dht_access,
+ .readlink = dht_readlink,
+ .setxattr = dht_setxattr,
+ .getxattr = dht_getxattr,
+ .removexattr = dht_removexattr,
+ .open = dht_open,
+ .readv = dht_readv,
+ .writev = dht_writev,
+ .flush = dht_flush,
+ .fsync = dht_fsync,
+ .statfs = dht_statfs,
+ .lk = dht_lk,
+ .opendir = dht_opendir,
+ .readdir = dht_readdir,
+ .fsyncdir = dht_fsyncdir,
+ .symlink = dht_symlink,
+ .unlink = dht_unlink,
+ .link = dht_link,
+ .mkdir = dht_mkdir,
+ .rmdir = dht_rmdir,
+ .rename = dht_rename,
+ .inodelk = dht_inodelk,
+ .finodelk = dht_finodelk,
+ .entrylk = dht_entrylk,
+ .fentrylk = dht_fentrylk,
+ .xattrop = dht_xattrop,
+ .fxattrop = dht_fxattrop,
+ #if 0
+ .setdents = dht_setdents,
+ .getdents = dht_getdents,
+ .checksum = dht_checksum,
+ #endif
+ };
+
+ /* The xlator_mops table is intentionally left empty for nufa. */
+ struct xlator_mops mops = {
+ };
+
+ /*
+  * Callback table of the nufa translator: only forget is wired up (to
+  * dht_forget); the release and releasedir entries are commented out.
+  */
+ struct xlator_cbks cbks = {
+ // .release = dht_release,
+ // .releasedir = dht_releasedir,
+ .forget = dht_forget
+ };
+
+ /*
+  * Options accepted by the nufa translator: "local-volume-name" (an xlator
+  * reference, read in init() to pick the local volume) and
+  * "lookup-unhashed" (boolean).  The entry with a NULL key ends the list.
+  */
+ struct volume_options options[] = {
+ { .key = {"local-volume-name"},
+ .type = GF_OPTION_TYPE_XLATOR
+ },
+ { .key = {"lookup-unhashed"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {NULL} },
+ };
diff --git a/xlators/cluster/ha/Makefile.am b/xlators/cluster/ha/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/cluster/ha/Makefile.am
@@ -0,0 +1,3 @@
+ # The ha translator sources live in src/; nothing extra to clean here.
+ SUBDIRS = src
+
+ CLEANFILES =
diff --git a/xlators/cluster/ha/src/Makefile.am b/xlators/cluster/ha/src/Makefile.am
new file mode 100644
index 000000000..069a0dcde
--- /dev/null
+++ b/xlators/cluster/ha/src/Makefile.am
@@ -0,0 +1,15 @@
+ # Build the ha translator as a loadable module installed under the
+ # cluster xlator directory.
+ xlator_LTLIBRARIES = ha.la
+ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+ # libtool spells this flag -avoid-version; "-avoidversion" is not
+ # recognised, so the module was still built with version suffixes.
+ ha_la_LDFLAGS = -module -avoid-version
+
+ ha_la_SOURCES = ha-helpers.c ha.c
+ ha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+ noinst_HEADERS = ha.h
+
+ AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
+ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+ CLEANFILES =
+
diff --git a/xlators/cluster/ha/src/ha-helpers.c b/xlators/cluster/ha/src/ha-helpers.c
new file mode 100644
index 000000000..8193caf27
--- /dev/null
+++ b/xlators/cluster/ha/src/ha-helpers.c
@@ -0,0 +1,191 @@
+/*
+ Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include "xlator.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "dict.h"
+#include "compat-errno.h"
+#include "ha.h"
+
+/*
+ * Prepare frame->local for an fd-based fop.
+ *
+ * If frame->local is already set nothing is done and 0 is returned.
+ * Otherwise a new ha_local_t is allocated, the per-child state kept in the
+ * fd context is copied into local->state, and an active child is chosen:
+ * the preferred subvolume when one is configured, else the fd's recorded
+ * active child; if that child is marked down the first child marked up is
+ * used instead.  local->tries is set to the number of children marked up
+ * and a reference on fd is stored in local->fd.
+ *
+ * Returns 0 on success, the negative fd_ctx_get() result when the fd has
+ * no context, -ENOMEM on allocation failure, -ENOTCONN when no child is up.
+ * On failure the state copy is released here, because the common cleanup
+ * in ha_handle_cbk() only frees it when local->fd is set.
+ */
+int ha_alloc_init_fd (call_frame_t *frame, fd_t *fd)
+{
+        ha_local_t   *local       = NULL;
+        int           i           = -1;
+        ha_private_t *pvt         = NULL;
+        int           child_count = 0;
+        int           ret         = -1;
+        hafd_t       *hafdp       = NULL;
+        xlator_t     *this        = NULL;
+        uint64_t      tmp_hafdp   = 0;
+
+        this = frame->this;
+        local = frame->local;
+        pvt = this->private;
+        child_count = pvt->child_count;
+
+        /* Already initialised by an earlier pass of the same fop. */
+        if (local != NULL)
+                return 0;
+
+        ret = fd_ctx_get (fd, this, &tmp_hafdp);
+        if (ret < 0) {
+                goto out;
+        }
+        hafdp = (hafd_t *)(long)tmp_hafdp;
+
+        local = frame->local = CALLOC (1, sizeof (*local));
+        if (local == NULL) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        local->state = CALLOC (1, child_count);
+        if (local->state == NULL) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        /* take care of the preferred subvolume */
+        if (pvt->pref_subvol == -1)
+                local->active = hafdp->active;
+        else
+                local->active = pvt->pref_subvol;
+
+        LOCK (&hafdp->lock);
+        memcpy (local->state, hafdp->fdstate, child_count);
+        UNLOCK (&hafdp->lock);
+
+        /* in case the preferred subvolume is down */
+        if ((local->active != -1) && (local->state[local->active] == 0))
+                local->active = -1;
+
+        for (i = 0; i < child_count; i++) {
+                if (local->state[i]) {
+                        if (local->active == -1)
+                                local->active = i;
+                        local->tries++;
+                }
+        }
+        if (local->active == -1) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+        local->fd = fd_ref (fd);
+        ret = 0;
+out:
+        /* local is non-NULL here only if it was allocated by this call. */
+        if (ret < 0 && local != NULL && local->state != NULL) {
+                FREE (local->state);
+                local->state = NULL;
+        }
+        return ret;
+}
+
+/*
+ * Common completion handling shared by the single-child fop callbacks.
+ *
+ * cookie carries the index of the child that answered.  Returns 0 when the
+ * caller should unwind the result, or -1 when the saved stub was resumed
+ * to retry the fop on another child (the caller must not unwind then).
+ */
+int ha_handle_cbk (call_frame_t *frame, void *cookie, int op_ret, int op_errno)
+{
+ xlator_t *xl = NULL;
+ ha_private_t *pvt = NULL;
+ xlator_t **children = NULL;
+ int prev_child = -1;
+ hafd_t *hafdp = NULL;
+ int ret = -1;
+ call_stub_t *stub = NULL;
+ ha_local_t *local = NULL;
+ uint64_t tmp_hafdp = 0;
+
+ xl = frame->this;
+ pvt = xl->private;
+ children = pvt->children;
+ prev_child = (long) cookie;
+ local = frame->local;
+
+ if (op_ret == -1) {
+ gf_log (xl->name, GF_LOG_ERROR ,"(child=%s) (op_ret=%d op_errno=%s)",
+ children[prev_child]->name, op_ret, strerror (op_errno));
+ }
+ /* Only a disconnected child (ENOTCONN) triggers a retry elsewhere. */
+ if (op_ret == -1 && (op_errno == ENOTCONN)) {
+ ret = 0;
+ if (local->fd) {
+ ret = fd_ctx_get (local->fd, xl, &tmp_hafdp);
+ }
+ hafdp = (hafd_t *)(long)tmp_hafdp;
+ if (ret == 0) {
+ if (local->fd) {
+ /* fd-based call: record in the fd context that this child is down */
+ LOCK(&hafdp->lock);
+ hafdp->fdstate[prev_child] = 0;
+ UNLOCK(&hafdp->lock);
+ }
+ local->tries--;
+ if (local->tries != 0) {
+ /* advance, wrapping around, to the next child marked up in local->state */
+ while (1) {
+ local->active = (local->active + 1) % pvt->child_count;
+ if (local->state[local->active])
+ break;
+ }
+ /* detach the stub from local before resuming it */
+ stub = local->stub;
+ local->stub = NULL;
+ call_resume (stub);
+ return -1;
+ }
+ }
+ }
+ /* No retry: release what the fop entry point stored in local. */
+ if (local->stub)
+ call_stub_destroy (local->stub);
+ /* local->state is freed only for fd-based calls (local->fd set) */
+ if (local->fd) {
+ FREE (local->state);
+ fd_unref (local->fd);
+ }
+ return 0;
+}
+
+/*
+ * Prepare frame->local for an inode-based fop.
+ *
+ * Does nothing (returns 0) if frame->local already exists.  Otherwise a
+ * new ha_local_t is allocated, local->state is pointed at the per-child
+ * state stored in the inode context, the preferred subvolume (or, failing
+ * that, the first child marked up) becomes local->active, and local->tries
+ * counts the children marked up.
+ *
+ * Returns 0 on success, -ENOMEM, the negative inode_ctx_get() result, or
+ * -ENOTCONN when no child is marked up.
+ */
+int ha_alloc_init_inode (call_frame_t *frame, inode_t *inode)
+{
+        xlator_t     *xl        = frame->this;
+        ha_private_t *pvt       = xl->private;
+        ha_local_t   *local     = frame->local;
+        uint64_t      ctx_value = 0;
+        int           idx       = 0;
+        int           ret       = -1;
+
+        if (local != NULL)
+                return 0;
+
+        local = frame->local = CALLOC (1, sizeof (*local));
+        if (local == NULL)
+                return -ENOMEM;
+
+        local->active = pvt->pref_subvol;
+
+        ret = inode_ctx_get (inode, xl, &ctx_value);
+        if (ret < 0)
+                return ret;
+
+        /* The state array is borrowed from the inode context, not copied. */
+        local->state = (char *)(long)ctx_value;
+
+        if (local->active != -1 && local->state[local->active] == 0)
+                local->active = -1;
+
+        for (idx = 0; idx < pvt->child_count; idx++) {
+                if (!local->state[idx])
+                        continue;
+                if (local->active == -1)
+                        local->active = idx;
+                local->tries++;
+        }
+
+        if (local->active == -1)
+                return -ENOTCONN;
+
+        return 0;
+}
diff --git a/xlators/cluster/ha/src/ha.c b/xlators/cluster/ha/src/ha.c
new file mode 100644
index 000000000..4542bdc7e
--- /dev/null
+++ b/xlators/cluster/ha/src/ha.c
@@ -0,0 +1,3479 @@
+/*
+ Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+/* generate errors randomly, code is simple now, better algorithm
+ * can be written to decide what error to be returned and when
+ */
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "dict.h"
+#include "compat-errno.h"
+#include "ha.h"
+
+/*
+ * TODO:
+ * - dbench fails if ha over server side afr
+ * - lock calls - lock on all subvols.
+ * - support preferred-subvolume option. code already there.
+ * - do not alloc the call-stub in case only one subvol is up.
+ */
+
+/*
+ * forget callback: remove this translator's value from the inode context
+ * and free the state array it points to.  Always returns 0.
+ */
+int
+ha_forget (xlator_t *this, inode_t *inode)
+{
+        uint64_t ctx_value = 0;
+
+        if (inode_ctx_del (inode, this, &ctx_value) == 0) {
+                char *child_state = (char *)(long)ctx_value;
+                FREE (child_state);
+        }
+
+        return 0;
+}
+
+/*
+ * Per-child completion callback for ha_lookup().
+ *
+ * On a fresh lookup a successful child is marked up in the inode's state
+ * array; on a revalidate a result that disagrees with the recorded state
+ * flags a revalidate error.  The first successful reply supplies the stat
+ * buffer and dict.  Once every child has answered the frame is unwound
+ * (with EIO if a revalidate error was flagged).
+ *
+ * Fix: the errno filter compared op_ret (always -1 there) against
+ * ENOTCONN, so ENOTCONN from a disconnected child could overwrite a more
+ * meaningful errno from another child; it now tests op_errno.
+ */
+int32_t
+ha_lookup_cbk (call_frame_t *frame,
+               void *cookie,
+               xlator_t *this,
+               int32_t op_ret,
+               int32_t op_errno,
+               inode_t *inode,
+               struct stat *buf,
+               dict_t *dict)
+{
+        ha_local_t   *local = NULL;
+        ha_private_t *pvt = NULL;
+        int           child_count = 0, i = 0, callcnt = 0;
+        char         *state = NULL;
+        call_frame_t *prev_frame = NULL;
+        xlator_t    **children = NULL;
+        uint64_t      tmp_state = 0;
+
+        local = frame->local;
+        pvt = this->private;
+        child_count = pvt->child_count;
+        prev_frame = cookie;
+        children = pvt->children;
+
+        /* Find which child this reply came from. */
+        for (i = 0; i < child_count; i++) {
+                if (pvt->children[i] == prev_frame->this)
+                        break;
+        }
+        if ((op_ret == -1) && (op_errno != ENOENT)) {
+                gf_log (this->name, GF_LOG_ERROR, "(child=%s) (op_ret=%d op_errno=%s)",
+                        children[i]->name, op_ret, strerror (op_errno));
+        }
+        inode_ctx_get (local->inode, this, &tmp_state);
+        state = (char *)(long)tmp_state;
+
+        LOCK (&frame->lock);
+        if (local->revalidate == 1) {
+                if ((!op_ret) != state[i]) {
+                        local->revalidate_error = 1;
+                        gf_log (this->name, GF_LOG_DEBUG, "revalidate error on %s",
+                                pvt->children[i]->name);
+                }
+        } else {
+                if (op_ret == 0) {
+                        state[i] = 1;
+                }
+        }
+        if (local->op_ret == -1 && op_ret == 0) {
+                local->op_ret = 0;
+                local->buf = *buf;
+                if (dict)
+                        local->dict = dict_ref (dict);
+        }
+        /* Keep the ENOTCONN default unless a child reports something else. */
+        if (op_ret == -1 && op_errno != ENOTCONN)
+                local->op_errno = op_errno;
+        callcnt = --local->call_count;
+        UNLOCK (&frame->lock);
+
+        if (callcnt == 0) {
+                /* Copy out what is needed after the unwind. */
+                dict_t  *ctx = local->dict;
+                inode_t *looked_up = local->inode;
+
+                if (local->revalidate_error == 1) {
+                        local->op_ret = -1;
+                        local->op_errno = EIO;
+                        gf_log (this->name, GF_LOG_DEBUG, "revalidate error, returning EIO");
+                }
+                STACK_UNWIND (frame,
+                              local->op_ret,
+                              local->op_errno,
+                              looked_up,
+                              &local->buf,
+                              ctx);
+                if (looked_up)
+                        inode_unref (looked_up);
+                if (ctx)
+                        dict_unref (ctx);
+        }
+        return 0;
+}
+
+/*
+ * lookup fop: wind the lookup to every child and let ha_lookup_cbk()
+ * merge the replies.
+ *
+ * If the inode has no context yet a zeroed per-child state array is
+ * attached to it; otherwise the call is treated as a revalidate.
+ * Allocation failures are now reported by unwinding with ENOMEM instead
+ * of dereferencing a NULL pointer.
+ */
+int32_t
+ha_lookup (call_frame_t *frame,
+           xlator_t *this,
+           loc_t *loc,
+           dict_t *xattr_req)
+{
+        ha_local_t   *local = NULL;
+        ha_private_t *pvt = NULL;
+        int           child_count = 0, i = 0;
+        char         *state = NULL;
+        xlator_t    **children = NULL;
+        int           ret = -1;
+
+        pvt = this->private;
+        child_count = pvt->child_count;
+        children = pvt->children;
+
+        frame->local = local = CALLOC (1, sizeof (*local));
+        if (local == NULL)
+                goto enomem;
+
+        local->inode = inode_ref (loc->inode);
+
+        ret = inode_ctx_get (loc->inode, this, NULL);
+        if (ret) {
+                state = CALLOC (1, child_count);
+                if (state == NULL) {
+                        inode_unref (local->inode);
+                        local->inode = NULL;
+                        goto enomem;
+                }
+                inode_ctx_put (loc->inode, this, (uint64_t)(long)state);
+        } else
+                local->revalidate = 1;
+
+        local->op_ret = -1;
+        local->op_errno = ENOTCONN;
+        local->call_count = child_count;
+
+        for (i = 0; i < child_count; i++) {
+                STACK_WIND (frame,
+                            ha_lookup_cbk,
+                            children[i],
+                            children[i]->fops->lookup,
+                            loc,
+                            xattr_req);
+        }
+        return 0;
+
+enomem:
+        STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * stat completion: unwind the reply unless ha_handle_cbk() re-drove the
+ * call on another child (non-zero return).
+ */
+int32_t
+ha_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+             int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, buf);
+        return 0;
+}
+
+/*
+ * stat fop: set up per-call state from loc->inode, save a stub of the
+ * call in local->stub and wind stat to the active child.  If the state
+ * cannot be set up the call is unwound with -1 and the positive errno.
+ */
+int32_t
+ha_stat (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        ha_local_t *local = NULL;
+        int         ret   = 0;
+
+        ret = ha_alloc_init_inode (frame, loc->inode);
+        if (ret < 0) {
+                STACK_UNWIND (frame, -1, -ret, NULL);
+                return 0;
+        }
+
+        local = frame->local;
+        local->stub = fop_stat_stub (frame, ha_stat, loc);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_stat_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->stat,
+                           loc);
+        return 0;
+}
+
+/*
+ * chmod completion: unwind the reply unless ha_handle_cbk() re-drove the
+ * call on another child (non-zero return).
+ */
+int32_t
+ha_chmod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+              int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, buf);
+        return 0;
+}
+
+/*
+ * chmod fop: set up per-call state from loc->inode, save a stub of the
+ * call and wind chmod to the active child; unwind with -1 and the
+ * positive errno if the state cannot be set up.
+ */
+int32_t
+ha_chmod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode)
+{
+        ha_local_t *local = NULL;
+        int         ret   = 0;
+
+        ret = ha_alloc_init_inode (frame, loc->inode);
+        if (ret < 0) {
+                STACK_UNWIND (frame, -1, -ret, NULL);
+                return 0;
+        }
+
+        local = frame->local;
+        local->stub = fop_chmod_stub (frame, ha_chmod, loc, mode);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_chmod_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->chmod,
+                           loc,
+                           mode);
+        return 0;
+}
+
+/*
+ * fchmod completion: unwind the reply unless ha_handle_cbk() re-drove the
+ * call on another child (non-zero return).
+ */
+int32_t
+ha_fchmod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, buf);
+        return 0;
+}
+
+/*
+ * fchmod fop: set up per-call state from the fd, save a stub of the call
+ * and wind fchmod to the active child; unwind with -1 and the positive
+ * errno if the state cannot be set up.
+ */
+int32_t
+ha_fchmod (call_frame_t *frame, xlator_t *this, fd_t *fd, mode_t mode)
+{
+        ha_local_t *local = NULL;
+        int         ret   = 0;
+
+        ret = ha_alloc_init_fd (frame, fd);
+        if (ret < 0) {
+                STACK_UNWIND (frame, -1, -ret, NULL);
+                return 0;
+        }
+
+        local = frame->local;
+        local->stub = fop_fchmod_stub (frame, ha_fchmod, fd, mode);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_fchmod_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->fchmod,
+                           fd,
+                           mode);
+        return 0;
+}
+
+/*
+ * chown completion: unwind the reply unless ha_handle_cbk() re-drove the
+ * call on another child (non-zero return).
+ */
+int32_t
+ha_chown_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+              int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, buf);
+        return 0;
+}
+
+/*
+ * chown fop: set up per-call state from loc->inode, save a stub of the
+ * call and wind chown to the active child.
+ *
+ * Fix: the error path used to unwind a hard-coded ENOTCONN, discarding the
+ * errno computed from ha_alloc_init_inode() (e.g. ENOMEM); it now reports
+ * op_errno like the other fops in this file.
+ */
+int32_t
+ha_chown (call_frame_t *frame,
+          xlator_t *this,
+          loc_t *loc,
+          uid_t uid,
+          gid_t gid)
+{
+        ha_local_t *local = NULL;
+        int         op_errno = 0;
+
+        op_errno = ha_alloc_init_inode (frame, loc->inode);
+        if (op_errno < 0) {
+                /* helper returns a negative errno */
+                op_errno = -op_errno;
+                goto err;
+        }
+        local = frame->local;
+        local->stub = fop_chown_stub (frame, ha_chown, loc, uid, gid);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_chown_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->chown,
+                           loc,
+                           uid,
+                           gid);
+        return 0;
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL);
+        return 0;
+}
+
+/*
+ * fchown completion: unwind the reply unless ha_handle_cbk() re-drove the
+ * call on another child (non-zero return).
+ */
+int32_t
+ha_fchown_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, buf);
+        return 0;
+}
+
+/*
+ * fchown fop: set up per-call state from the fd, save a stub of the call
+ * and wind fchown to the active child; unwind with -1 and the positive
+ * errno if the state cannot be set up.
+ */
+int32_t
+ha_fchown (call_frame_t *frame, xlator_t *this, fd_t *fd,
+           uid_t uid, gid_t gid)
+{
+        ha_local_t *local = NULL;
+        int         ret   = 0;
+
+        ret = ha_alloc_init_fd (frame, fd);
+        if (ret < 0) {
+                STACK_UNWIND (frame, -1, -ret, NULL);
+                return 0;
+        }
+
+        local = frame->local;
+        local->stub = fop_fchown_stub (frame, ha_fchown, fd, uid, gid);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_fchown_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->fchown,
+                           fd,
+                           uid,
+                           gid);
+        return 0;
+}
+
+/*
+ * truncate completion: unwind the reply unless ha_handle_cbk() re-drove
+ * the call on another child (non-zero return).
+ */
+int32_t
+ha_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, buf);
+        return 0;
+}
+
+/*
+ * truncate fop: set up per-call state from loc->inode, save a stub of the
+ * call and wind truncate to the active child; unwind with -1 and the
+ * positive errno if the state cannot be set up.
+ */
+int32_t
+ha_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset)
+{
+        ha_local_t *local = NULL;
+        int         ret   = 0;
+
+        ret = ha_alloc_init_inode (frame, loc->inode);
+        if (ret < 0) {
+                STACK_UNWIND (frame, -1, -ret, NULL);
+                return 0;
+        }
+
+        local = frame->local;
+        local->stub = fop_truncate_stub (frame, ha_truncate, loc, offset);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_truncate_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->truncate,
+                           loc,
+                           offset);
+        return 0;
+}
+
+/*
+ * ftruncate completion: unwind the reply unless ha_handle_cbk() re-drove
+ * the call on another child (non-zero return).
+ */
+int32_t
+ha_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, buf);
+        return 0;
+}
+
+/*
+ * ftruncate fop: set up per-call state from the fd, save a stub of the
+ * call and wind ftruncate to the active child; unwind with -1 and the
+ * positive errno if the state cannot be set up.
+ */
+int32_t
+ha_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset)
+{
+        ha_local_t *local = NULL;
+        int         ret   = 0;
+
+        ret = ha_alloc_init_fd (frame, fd);
+        if (ret < 0) {
+                STACK_UNWIND (frame, -1, -ret, NULL);
+                return 0;
+        }
+
+        local = frame->local;
+        local->stub = fop_ftruncate_stub (frame, ha_ftruncate, fd, offset);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_ftruncate_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->ftruncate,
+                           fd,
+                           offset);
+        return 0;
+}
+
+/*
+ * utimens completion: unwind the reply unless ha_handle_cbk() re-drove
+ * the call on another child (non-zero return).
+ */
+int32_t
+ha_utimens_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, buf);
+        return 0;
+}
+
+/*
+ * utimens fop: set up per-call state from loc->inode, save a stub of the
+ * call and wind utimens to the active child; unwind with -1 and the
+ * positive errno if the state cannot be set up.
+ */
+int32_t
+ha_utimens (call_frame_t *frame, xlator_t *this, loc_t *loc,
+            struct timespec tv[2])
+{
+        ha_local_t *local = NULL;
+        int         ret   = 0;
+
+        ret = ha_alloc_init_inode (frame, loc->inode);
+        if (ret < 0) {
+                STACK_UNWIND (frame, -1, -ret, NULL);
+                return 0;
+        }
+
+        local = frame->local;
+        local->stub = fop_utimens_stub (frame, ha_utimens, loc, tv);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_utimens_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->utimens,
+                           loc,
+                           tv);
+        return 0;
+}
+
+/*
+ * access completion: unwind the reply unless ha_handle_cbk() re-drove the
+ * call on another child (non-zero return).
+ */
+int32_t
+ha_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+/*
+ * access fop: set up per-call state from loc->inode, save a stub of the
+ * call and wind access to the active child; unwind with -1 and the
+ * positive errno if the state cannot be set up.
+ */
+int32_t
+ha_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask)
+{
+        ha_local_t *local = NULL;
+        int         ret   = 0;
+
+        ret = ha_alloc_init_inode (frame, loc->inode);
+        if (ret < 0) {
+                STACK_UNWIND (frame, -1, -ret);
+                return 0;
+        }
+
+        local = frame->local;
+        local->stub = fop_access_stub (frame, ha_access, loc, mask);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_access_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->access,
+                           loc,
+                           mask);
+        return 0;
+}
+
+
+/*
+ * readlink completion: unwind the reply unless ha_handle_cbk() re-drove
+ * the call on another child (non-zero return).
+ */
+int32_t
+ha_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, const char *path)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, path);
+        return 0;
+}
+
+/*
+ * readlink fop: set up per-call state from loc->inode, save a stub of the
+ * call and wind readlink to the active child; unwind with -1 and the
+ * positive errno if the state cannot be set up.
+ */
+int32_t
+ha_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size)
+{
+        ha_local_t *local = NULL;
+        int         ret   = 0;
+
+        ret = ha_alloc_init_inode (frame, loc->inode);
+        if (ret < 0) {
+                STACK_UNWIND (frame, -1, -ret, NULL);
+                return 0;
+        }
+
+        local = frame->local;
+        local->stub = fop_readlink_stub (frame, ha_readlink, loc, size);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_readlink_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->readlink,
+                           loc,
+                           size);
+        return 0;
+}
+
+/*
+ * Completion of the follow-up lookups that ha_mknod_cbk() winds to the
+ * remaining children after the first successful mknod.  A successful
+ * lookup marks that child in the inode's state array; once the last
+ * pending reply arrives the mknod result kept in local is unwound.
+ */
+int
+ha_mknod_lookup_cbk (call_frame_t *frame,
+                     void *cookie,
+                     xlator_t *this,
+                     int32_t op_ret,
+                     int32_t op_errno,
+                     inode_t *inode,
+                     struct stat *buf,
+                     dict_t *dict)
+{
+        ha_local_t   *local      = frame->local;
+        ha_private_t *pvt        = this->private;
+        xlator_t    **children   = pvt->children;
+        call_frame_t *prev_frame = cookie;
+        int           nchildren  = pvt->child_count;
+        int           idx        = 0;
+        int           pending    = 0;
+        uint64_t      ctx_value  = 0;
+        char         *stateino   = NULL;
+        call_stub_t  *stub       = NULL;
+
+        /* Index of the child whose reply this is. */
+        while (idx < nchildren && prev_frame->this != children[idx])
+                idx++;
+
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "(path=%s) (op_ret=%d op_errno=%d)",
+                        local->stub->args.mknod.loc.path, op_ret, op_errno);
+        }
+
+        if (inode_ctx_get (local->stub->args.mknod.loc.inode,
+                           this, &ctx_value) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "unwind(-1), inode_ctx_get() error");
+                /* More callbacks are still expected, so carry on. */
+        } else if (op_ret == 0) {
+                stateino = (char *)(long)ctx_value;
+                stateino[idx] = 1;
+        }
+
+        LOCK (&frame->lock);
+        pending = --local->call_count;
+        UNLOCK (&frame->lock);
+
+        if (pending != 0)
+                return 0;
+
+        /* Keep the stub pointer: it is destroyed after the unwind. */
+        stub = local->stub;
+        FREE (local->state);
+        STACK_UNWIND (frame,
+                      local->op_ret,
+                      local->op_errno,
+                      local->stub->args.mknod.loc.inode,
+                      &local->buf);
+        call_stub_destroy (stub);
+        return 0;
+}
+
+/*
+ * Completion of a mknod wound to one child.
+ *
+ * On failure the mknod is retried on the next child marked up in
+ * local->state.  After the first success the remaining up children are
+ * sent lookups (ha_mknod_lookup_cbk) so their state can be recorded.  When
+ * no replies are pending, or no further child is up, the result kept in
+ * local is unwound.
+ *
+ * Fix: the inode's state array is only written when inode_ctx_get()
+ * succeeded (previously a failure was logged and then dereferenced), and
+ * a redundant re-assignment of the saved stub pointer was removed.
+ */
+int32_t
+ha_mknod_cbk (call_frame_t *frame,
+              void *cookie,
+              xlator_t *this,
+              int32_t op_ret,
+              int32_t op_errno,
+              inode_t *inode,
+              struct stat *buf)
+{
+        ha_local_t   *local = NULL;
+        ha_private_t *pvt = NULL;
+        char         *stateino = NULL;
+        int           child_count = 0, i = 0, cnt = 0, ret = 0;
+        call_frame_t *prev_frame = NULL;
+        xlator_t    **children = NULL;
+        uint64_t      tmp_stateino = 0;
+
+        local = frame->local;
+        pvt = this->private;
+        child_count = pvt->child_count;
+        prev_frame = cookie;
+        children = pvt->children;
+
+        /* Index of the child whose reply this is. */
+        for (i = 0; i < child_count; i++)
+                if (prev_frame->this == children[i])
+                        break;
+
+        if (op_ret == -1) {
+                local->op_errno = op_errno;
+                gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mknod.loc.path, op_ret, op_errno);
+        }
+
+        ret = inode_ctx_get (local->stub->args.mknod.loc.inode,
+                             this, &tmp_stateino);
+        stateino = (char *)(long)tmp_stateino;
+
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "inode_ctx_get() error");
+                stateino = NULL;
+        }
+        if (op_ret == 0) {
+                if (stateino != NULL)
+                        stateino[i] = 1;
+                local->op_ret = 0;
+                local->first_success = 1;
+                local->buf = *buf;
+        }
+        cnt = --local->call_count;
+        /* Next child after the current one that is marked up. */
+        for (i = local->active + 1; i < child_count; i++) {
+                if (local->state[i])
+                        break;
+        }
+
+        if (cnt == 0 || i == child_count) {
+                /* Keep the stub pointer: it is destroyed after the unwind. */
+                call_stub_t *stub = local->stub;
+                FREE (local->state);
+                STACK_UNWIND (frame, local->op_ret, local->op_errno, local->stub->args.mknod.loc.inode, &local->buf);
+                call_stub_destroy (stub);
+                return 0;
+        }
+
+        local->active = i;
+
+        if (local->first_success == 0) {
+                /* Nothing has succeeded yet: retry the mknod itself. */
+                STACK_WIND (frame,
+                            ha_mknod_cbk,
+                            children[i],
+                            children[i]->fops->mknod,
+                            &local->stub->args.mknod.loc,
+                            local->stub->args.mknod.mode,
+                            local->stub->args.mknod.rdev);
+                return 0;
+        }
+        cnt = local->call_count;
+
+        for (; i < child_count; i++) {
+                if (local->state[i]) {
+                        STACK_WIND (frame,
+                                    ha_mknod_lookup_cbk,
+                                    children[i],
+                                    children[i]->fops->lookup,
+                                    &local->stub->args.mknod.loc,
+                                    0);
+                        if (--cnt == 0)
+                                break;
+                }
+        }
+        return 0;
+}
+
+/*
+ * mknod fop: snapshot the translator-wide child state, attach a zeroed
+ * per-child state array to the new inode and wind mknod to the first
+ * child marked up; ha_mknod_cbk() drives the rest.
+ *
+ * Fixes: allocation results are checked (ENOMEM), and when no child is up
+ * the call is unwound with ENOTCONN instead of indexing the children
+ * array with -1.  On these error paths everything allocated here is
+ * released before unwinding.
+ */
+int32_t
+ha_mknod (call_frame_t *frame,
+          xlator_t *this,
+          loc_t *loc,
+          mode_t mode,
+          dev_t rdev)
+{
+        ha_local_t   *local = NULL;
+        ha_private_t *pvt = NULL;
+        int           child_count = 0, i = 0;
+        char         *stateino = NULL;
+        int           op_errno = ENOMEM;
+
+        pvt = this->private;
+        child_count = pvt->child_count;
+
+        frame->local = local = CALLOC (1, sizeof (*local));
+        if (local == NULL)
+                goto err;
+
+        local->op_ret = -1;
+        local->op_errno = ENOTCONN;
+        local->active = -1;
+
+        local->stub = fop_mknod_stub (frame, ha_mknod, loc, mode, rdev);
+        local->state = CALLOC (1, child_count);
+        stateino = CALLOC (1, child_count);
+        if (local->stub == NULL || local->state == NULL || stateino == NULL)
+                goto err;
+
+        memcpy (local->state, pvt->state, child_count);
+
+        for (i = 0; i < child_count; i++) {
+                if (local->state[i]) {
+                        local->call_count++;
+                        if (local->active == -1)
+                                local->active = i;
+                }
+        }
+
+        if (local->active == -1) {
+                op_errno = ENOTCONN;
+                goto err;
+        }
+
+        /* The array now belongs to the inode context. */
+        inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino);
+        stateino = NULL;
+
+        STACK_WIND (frame,
+                    ha_mknod_cbk,
+                    HA_ACTIVE_CHILD(this, local),
+                    HA_ACTIVE_CHILD(this, local)->fops->mknod,
+                    loc, mode, rdev);
+        return 0;
+
+err:
+        if (local != NULL) {
+                if (local->stub != NULL) {
+                        call_stub_destroy (local->stub);
+                        local->stub = NULL;
+                }
+                if (local->state != NULL) {
+                        FREE (local->state);
+                        local->state = NULL;
+                }
+        }
+        if (stateino != NULL) {
+                FREE (stateino);
+        }
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+
+/*
+ * Completion of the follow-up lookups that ha_mkdir_cbk() winds to the
+ * remaining children after the first successful mkdir.  A successful
+ * lookup marks that child in the inode's state array; once the last
+ * pending reply arrives the mkdir result kept in local is unwound.
+ *
+ * Fix: the inode_ctx_get() result is now checked before the state array
+ * is written, matching ha_mknod_lookup_cbk().
+ */
+int
+ha_mkdir_lookup_cbk (call_frame_t *frame,
+                     void *cookie,
+                     xlator_t *this,
+                     int32_t op_ret,
+                     int32_t op_errno,
+                     inode_t *inode,
+                     struct stat *buf,
+                     dict_t *dict)
+{
+        ha_local_t   *local = NULL;
+        ha_private_t *pvt = NULL;
+        char         *stateino = NULL;
+        int           child_count = 0, i = 0, cnt = 0, ret = 0;
+        call_frame_t *prev_frame = NULL;
+        xlator_t    **children = NULL;
+        uint64_t      tmp_stateino = 0;
+
+        local = frame->local;
+        pvt = this->private;
+        child_count = pvt->child_count;
+        prev_frame = cookie;
+        children = pvt->children;
+
+        /* Index of the child whose reply this is. */
+        for (i = 0; i < child_count; i++)
+                if (prev_frame->this == children[i])
+                        break;
+
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mkdir.loc.path, op_ret, op_errno);
+        }
+        ret = inode_ctx_get (local->stub->args.mkdir.loc.inode,
+                             this, &tmp_stateino);
+        stateino = (char *)(long)tmp_stateino;
+
+        if (ret != 0) {
+                /* More callbacks are still expected, so only log here. */
+                gf_log (this->name, GF_LOG_ERROR, "inode_ctx_get() error");
+        } else if (op_ret == 0) {
+                stateino[i] = 1;
+        }
+
+        LOCK (&frame->lock);
+        cnt = --local->call_count;
+        UNLOCK (&frame->lock);
+
+        if (cnt == 0) {
+                /* Keep the stub pointer: it is destroyed after the unwind. */
+                call_stub_t *stub = local->stub;
+                FREE (local->state);
+                STACK_UNWIND (frame,
+                              local->op_ret,
+                              local->op_errno,
+                              local->stub->args.mkdir.loc.inode,
+                              &local->buf);
+                call_stub_destroy (stub);
+        }
+        return 0;
+}
+
+/*
+ * Completion of a mkdir wound to one child.
+ *
+ * On failure the mkdir is retried on the next child marked up in
+ * local->state.  After the first success the remaining up children are
+ * sent lookups (ha_mkdir_lookup_cbk) so their state can be recorded.  When
+ * no replies are pending, or no further child is up, the result kept in
+ * local is unwound.
+ *
+ * Fix: the inode's state array is only written when inode_ctx_get()
+ * succeeded, and a redundant re-assignment of the saved stub pointer was
+ * removed.
+ */
+int32_t
+ha_mkdir_cbk (call_frame_t *frame,
+              void *cookie,
+              xlator_t *this,
+              int32_t op_ret,
+              int32_t op_errno,
+              inode_t *inode,
+              struct stat *buf)
+{
+        ha_local_t   *local = NULL;
+        ha_private_t *pvt = NULL;
+        char         *stateino = NULL;
+        int           child_count = 0, i = 0, cnt = 0, ret = 0;
+        call_frame_t *prev_frame = NULL;
+        xlator_t    **children = NULL;
+        uint64_t      tmp_stateino = 0;
+
+        local = frame->local;
+        pvt = this->private;
+        child_count = pvt->child_count;
+        prev_frame = cookie;
+        children = pvt->children;
+
+        /* Index of the child whose reply this is. */
+        for (i = 0; i < child_count; i++)
+                if (prev_frame->this == children[i])
+                        break;
+
+        if (op_ret == -1) {
+                local->op_errno = op_errno;
+                gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mkdir.loc.path, op_ret, op_errno);
+        }
+
+        ret = inode_ctx_get (local->stub->args.mkdir.loc.inode,
+                             this, &tmp_stateino);
+        stateino = (char *)(long)tmp_stateino;
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "inode_ctx_get() error");
+                stateino = NULL;
+        }
+
+        if (op_ret == 0) {
+                if (stateino != NULL)
+                        stateino[i] = 1;
+                local->op_ret = 0;
+                local->first_success = 1;
+                local->buf = *buf;
+        }
+        cnt = --local->call_count;
+        /* Next child after the current one that is marked up. */
+        for (i = local->active + 1; i < child_count; i++) {
+                if (local->state[i])
+                        break;
+        }
+
+        if (cnt == 0 || i == child_count) {
+                /* Keep the stub pointer: it is destroyed after the unwind. */
+                call_stub_t *stub = local->stub;
+                FREE (local->state);
+                STACK_UNWIND (frame, local->op_ret, local->op_errno, local->stub->args.mkdir.loc.inode, &local->buf);
+                call_stub_destroy (stub);
+                return 0;
+        }
+
+        local->active = i;
+
+        if (local->first_success == 0) {
+                /* Nothing has succeeded yet: retry the mkdir itself. */
+                STACK_WIND (frame,
+                            ha_mkdir_cbk,
+                            children[i],
+                            children[i]->fops->mkdir,
+                            &local->stub->args.mkdir.loc,
+                            local->stub->args.mkdir.mode);
+                return 0;
+        }
+        cnt = local->call_count;
+
+        for (; i < child_count; i++) {
+                if (local->state[i]) {
+                        STACK_WIND (frame,
+                                    ha_mkdir_lookup_cbk,
+                                    children[i],
+                                    children[i]->fops->lookup,
+                                    &local->stub->args.mkdir.loc,
+                                    0);
+                        if (--cnt == 0)
+                                break;
+                }
+        }
+        return 0;
+}
+
+/*
+ * mkdir fop: snapshot the translator-wide child state, attach a zeroed
+ * per-child state array to the new inode and wind mkdir to the first
+ * child marked up; ha_mkdir_cbk() drives the rest.
+ *
+ * Fixes: allocation results are checked (ENOMEM), and when no child is up
+ * the call is unwound with ENOTCONN instead of indexing the children
+ * array with -1.  On these error paths everything allocated here is
+ * released before unwinding.
+ */
+int32_t
+ha_mkdir (call_frame_t *frame,
+          xlator_t *this,
+          loc_t *loc,
+          mode_t mode)
+{
+        ha_local_t   *local = NULL;
+        ha_private_t *pvt = NULL;
+        int           child_count = 0, i = 0;
+        char         *stateino = NULL;
+        int           op_errno = ENOMEM;
+
+        pvt = this->private;
+        child_count = pvt->child_count;
+
+        frame->local = local = CALLOC (1, sizeof (*local));
+        if (local == NULL)
+                goto err;
+
+        local->op_ret = -1;
+        local->op_errno = ENOTCONN;
+        local->active = -1;
+
+        local->stub = fop_mkdir_stub (frame, ha_mkdir, loc, mode);
+        local->state = CALLOC (1, child_count);
+        stateino = CALLOC (1, child_count);
+        if (local->stub == NULL || local->state == NULL || stateino == NULL)
+                goto err;
+
+        memcpy (local->state, pvt->state, child_count);
+
+        for (i = 0; i < child_count; i++) {
+                if (local->state[i]) {
+                        local->call_count++;
+                        if (local->active == -1)
+                                local->active = i;
+                }
+        }
+
+        if (local->active == -1) {
+                op_errno = ENOTCONN;
+                goto err;
+        }
+
+        /* The array now belongs to the inode context. */
+        inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino);
+        stateino = NULL;
+
+        STACK_WIND (frame,
+                    ha_mkdir_cbk,
+                    HA_ACTIVE_CHILD(this, local),
+                    HA_ACTIVE_CHILD(this, local)->fops->mkdir,
+                    loc, mode);
+        return 0;
+
+err:
+        if (local != NULL) {
+                if (local->stub != NULL) {
+                        call_stub_destroy (local->stub);
+                        local->stub = NULL;
+                }
+                if (local->state != NULL) {
+                        FREE (local->state);
+                        local->state = NULL;
+                }
+        }
+        if (stateino != NULL) {
+                FREE (stateino);
+        }
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+/*
+ * unlink completion: unwind the reply unless ha_handle_cbk() re-drove the
+ * call on another child (non-zero return).
+ */
+int32_t
+ha_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+/*
+ * unlink fop: set up per-call state from loc->inode, save a stub of the
+ * call and wind unlink to the active child; unwind with -1 and the
+ * positive errno if the state cannot be set up.
+ */
+int32_t
+ha_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        ha_local_t *local = NULL;
+        int         ret   = 0;
+
+        ret = ha_alloc_init_inode (frame, loc->inode);
+        if (ret < 0) {
+                STACK_UNWIND (frame, -1, -ret);
+                return 0;
+        }
+
+        local = frame->local;
+        local->stub = fop_unlink_stub (frame, ha_unlink, loc);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_unlink_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->unlink,
+                           loc);
+        return 0;
+}
+
+/*
+ * rmdir completion: unwind the reply unless ha_handle_cbk() re-drove the
+ * call on another child (non-zero return).
+ */
+int32_t
+ha_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+              int32_t op_ret, int32_t op_errno)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+/*
+ * rmdir fop: set up per-call state from loc->inode, save a stub of the
+ * call and wind rmdir to the active child; unwind with -1 and the
+ * positive errno if the state cannot be set up.
+ */
+int32_t
+ha_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        ha_local_t *local = NULL;
+        int         ret   = 0;
+
+        ret = ha_alloc_init_inode (frame, loc->inode);
+        if (ret < 0) {
+                STACK_UNWIND (frame, -1, -ret);
+                return 0;
+        }
+
+        local = frame->local;
+        local->stub = fop_rmdir_stub (frame, ha_rmdir, loc);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_rmdir_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->rmdir,
+                           loc);
+        return 0;
+}
+
+
+/*
+ * Completion of the follow-up lookups that ha_symlink_cbk() winds to the
+ * remaining children after the first successful symlink.  A successful
+ * lookup marks that child in the inode's state array; once the last
+ * pending reply arrives the symlink result kept in local is unwound.
+ *
+ * Fix: the inode_ctx_get() result is now checked before the state array
+ * is written, matching ha_mknod_lookup_cbk().
+ */
+int
+ha_symlink_lookup_cbk (call_frame_t *frame,
+                       void *cookie,
+                       xlator_t *this,
+                       int32_t op_ret,
+                       int32_t op_errno,
+                       inode_t *inode,
+                       struct stat *buf,
+                       dict_t *dict)
+{
+        ha_local_t   *local = NULL;
+        ha_private_t *pvt = NULL;
+        char         *stateino = NULL;
+        int           child_count = 0, i = 0, cnt = 0, ret = 0;
+        call_frame_t *prev_frame = NULL;
+        xlator_t    **children = NULL;
+        uint64_t      tmp_stateino = 0;
+
+        local = frame->local;
+        pvt = this->private;
+        child_count = pvt->child_count;
+        prev_frame = cookie;
+        children = pvt->children;
+
+        /* Index of the child whose reply this is. */
+        for (i = 0; i < child_count; i++)
+                if (prev_frame->this == children[i])
+                        break;
+
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.symlink.loc.path, op_ret, op_errno);
+        }
+        ret = inode_ctx_get (local->stub->args.symlink.loc.inode,
+                             this, &tmp_stateino);
+        stateino = (char *)(long)tmp_stateino;
+
+        if (ret != 0) {
+                /* More callbacks are still expected, so only log here. */
+                gf_log (this->name, GF_LOG_ERROR, "inode_ctx_get() error");
+        } else if (op_ret == 0) {
+                stateino[i] = 1;
+        }
+
+        LOCK (&frame->lock);
+        cnt = --local->call_count;
+        UNLOCK (&frame->lock);
+
+        if (cnt == 0) {
+                /* Keep the stub pointer: it is destroyed after the unwind. */
+                call_stub_t *stub = local->stub;
+                FREE (local->state);
+                STACK_UNWIND (frame,
+                              local->op_ret,
+                              local->op_errno,
+                              local->stub->args.symlink.loc.inode,
+                              &local->buf);
+                call_stub_destroy (stub);
+        }
+        return 0;
+}
+
+/*
+ * Completion of a symlink wound to one child.
+ *
+ * On failure the symlink is retried on the next child marked up in
+ * local->state.  After the first success the remaining up children are
+ * sent lookups (ha_symlink_lookup_cbk) so their state can be recorded.
+ * When no replies are pending, or no further child is up, the result kept
+ * in local is unwound.
+ *
+ * Fix: the inode's state array is only written when inode_ctx_get()
+ * succeeded, and a redundant re-assignment of the saved stub pointer was
+ * removed.
+ */
+int32_t
+ha_symlink_cbk (call_frame_t *frame,
+                void *cookie,
+                xlator_t *this,
+                int32_t op_ret,
+                int32_t op_errno,
+                inode_t *inode,
+                struct stat *buf)
+{
+        ha_local_t   *local = NULL;
+        ha_private_t *pvt = NULL;
+        char         *stateino = NULL;
+        int           child_count = 0, i = 0, cnt = 0, ret = 0;
+        call_frame_t *prev_frame = NULL;
+        xlator_t    **children = NULL;
+        uint64_t      tmp_stateino = 0;
+
+        local = frame->local;
+        pvt = this->private;
+        child_count = pvt->child_count;
+        prev_frame = cookie;
+        children = pvt->children;
+
+        /* Index of the child whose reply this is. */
+        for (i = 0; i < child_count; i++)
+                if (prev_frame->this == children[i])
+                        break;
+
+        if (op_ret == -1) {
+                local->op_errno = op_errno;
+                gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.symlink.loc.path, op_ret, op_errno);
+        }
+        ret = inode_ctx_get (local->stub->args.symlink.loc.inode,
+                             this, &tmp_stateino);
+        stateino = (char *)(long)tmp_stateino;
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "inode_ctx_get() error");
+                stateino = NULL;
+        }
+
+        if (op_ret == 0) {
+                if (stateino != NULL)
+                        stateino[i] = 1;
+                local->op_ret = 0;
+                local->first_success = 1;
+                local->buf = *buf;
+        }
+        cnt = --local->call_count;
+        /* Next child after the current one that is marked up. */
+        for (i = local->active + 1; i < child_count; i++) {
+                if (local->state[i])
+                        break;
+        }
+
+        if (cnt == 0 || i == child_count) {
+                /* Keep the stub pointer: it is destroyed after the unwind. */
+                call_stub_t *stub = local->stub;
+                FREE (local->state);
+                STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                              local->stub->args.symlink.loc.inode, &local->buf);
+                call_stub_destroy (stub);
+                return 0;
+        }
+
+        local->active = i;
+
+        if (local->first_success == 0) {
+                /* Nothing has succeeded yet: retry the symlink itself. */
+                STACK_WIND (frame,
+                            ha_symlink_cbk,
+                            children[i],
+                            children[i]->fops->symlink,
+                            local->stub->args.symlink.linkname,
+                            &local->stub->args.symlink.loc);
+                return 0;
+        }
+        cnt = local->call_count;
+
+        for (; i < child_count; i++) {
+                if (local->state[i]) {
+                        STACK_WIND (frame,
+                                    ha_symlink_lookup_cbk,
+                                    children[i],
+                                    children[i]->fops->lookup,
+                                    &local->stub->args.symlink.loc,
+                                    0);
+                        if (--cnt == 0)
+                                break;
+                }
+        }
+        return 0;
+}
+
+/*
+ * symlink fop: snapshot the translator-wide child state, attach a zeroed
+ * per-child state array to the new inode and wind symlink to the first
+ * child marked up; ha_symlink_cbk() drives the rest.
+ *
+ * Fixes: allocation results are checked (ENOMEM), and when no child is up
+ * the call is unwound with ENOTCONN instead of indexing the children
+ * array with -1.  On these error paths everything allocated here is
+ * released before unwinding.
+ */
+int32_t
+ha_symlink (call_frame_t *frame,
+            xlator_t *this,
+            const char *linkname,
+            loc_t *loc)
+{
+        ha_local_t   *local = NULL;
+        ha_private_t *pvt = NULL;
+        int           child_count = 0, i = 0;
+        char         *stateino = NULL;
+        int           op_errno = ENOMEM;
+
+        pvt = this->private;
+        child_count = pvt->child_count;
+
+        frame->local = local = CALLOC (1, sizeof (*local));
+        if (local == NULL)
+                goto err;
+
+        local->op_ret = -1;
+        local->op_errno = ENOTCONN;
+        local->active = -1;
+
+        local->stub = fop_symlink_stub (frame, ha_symlink, linkname, loc);
+        local->state = CALLOC (1, child_count);
+        stateino = CALLOC (1, child_count);
+        if (local->stub == NULL || local->state == NULL || stateino == NULL)
+                goto err;
+
+        memcpy (local->state, pvt->state, child_count);
+
+        for (i = 0; i < child_count; i++) {
+                if (local->state[i]) {
+                        local->call_count++;
+                        if (local->active == -1) {
+                                local->active = i;
+                        }
+                }
+        }
+
+        if (local->active == -1) {
+                op_errno = ENOTCONN;
+                goto err;
+        }
+
+        /* The array now belongs to the inode context. */
+        inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino);
+        stateino = NULL;
+
+        STACK_WIND (frame,
+                    ha_symlink_cbk,
+                    HA_ACTIVE_CHILD(this, local),
+                    HA_ACTIVE_CHILD(this, local)->fops->symlink,
+                    linkname, loc);
+        return 0;
+
+err:
+        if (local != NULL) {
+                if (local->stub != NULL) {
+                        call_stub_destroy (local->stub);
+                        local->stub = NULL;
+                }
+                if (local->state != NULL) {
+                        FREE (local->state);
+                        local->state = NULL;
+                }
+        }
+        if (stateino != NULL) {
+                FREE (stateino);
+        }
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Callback for the rename wound by ha_rename().
+ *
+ * The reply is handed to ha_handle_cbk() first and is passed on to the
+ * caller, unchanged, only when that helper returns 0.
+ * NOTE(review): ha_handle_cbk() is defined outside this block; presumably a
+ * non-zero return means it has taken over the frame - confirm.
+ */
+int32_t
+ha_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, buf);
+        return 0;
+}
+
+/*
+ * rename fop.
+ *
+ * ha_alloc_init_inode() is expected to populate frame->local for
+ * oldloc->inode; a negative return is treated as a negated errno and fails
+ * the call.  Otherwise a resume stub is stored in local->stub and the call
+ * is wound to HA_ACTIVE_CHILD(this, local) with local->active as cookie.
+ */
+int32_t
+ha_rename (call_frame_t *frame, xlator_t *this,
+           loc_t *oldloc, loc_t *newloc)
+{
+        ha_local_t *local = NULL;
+        int         rc    = ha_alloc_init_inode (frame, oldloc->inode);
+
+        if (rc < 0) {
+                STACK_UNWIND (frame, -1, -rc, NULL);
+                return 0;
+        }
+
+        local       = frame->local;
+        local->stub = fop_rename_stub (frame, ha_rename, oldloc, newloc);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_rename_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->rename,
+                           oldloc, newloc);
+        return 0;
+}
+
+/*
+ * Callback for the lookups that ha_link_cbk() fans out on newloc.
+ *
+ * Identifies the replying child by matching the previous frame's xlator
+ * against pvt->children, marks that child in the per-inode state array
+ * (taken from newloc's inode context) when the lookup succeeded, and
+ * unwinds the link call once the last outstanding reply has arrived.
+ */
+int
+ha_link_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, inode_t *inode,
+                    struct stat *buf, dict_t *dict)
+{
+        ha_local_t    *local       = frame->local;
+        ha_private_t  *pvt         = this->private;
+        call_frame_t  *prev        = cookie;
+        xlator_t     **subvols     = pvt->children;
+        int            nsubvols    = pvt->child_count;
+        int            idx         = 0;
+        int            pending     = 0;
+        uint64_t       ctx         = 0;
+        char          *inode_state = NULL;
+        call_stub_t   *stub        = NULL;
+
+        /* which child sent this reply? */
+        while (idx < nsubvols && prev->this != subvols[idx])
+                idx++;
+
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.link.newloc.path, op_ret, op_errno);
+        }
+
+        inode_ctx_get (local->stub->args.link.newloc.inode, this, &ctx);
+        inode_state = (char *)(long)ctx;
+
+        if (op_ret == 0)
+                inode_state[idx] = 1;
+
+        LOCK (&frame->lock);
+        pending = --local->call_count;
+        UNLOCK (&frame->lock);
+
+        if (pending != 0)
+                return 0;
+
+        /* last reply: hand the saved result back and drop the stub */
+        stub = local->stub;
+        FREE (local->state);
+        STACK_UNWIND (frame,
+                      local->op_ret,
+                      local->op_errno,
+                      local->stub->args.link.oldloc.inode,
+                      &local->buf);
+        call_stub_destroy (stub);
+        return 0;
+}
+
+/*
+ * Callback for the link wound by ha_link() (and re-wound from here).
+ *
+ * Records the reply of the child that answered, then either unwinds,
+ * retries the link on the next child marked in local->state (while no
+ * child has succeeded yet), or - after the first success - issues lookups
+ * on newloc to the remaining marked children via ha_link_lookup_cbk().
+ */
+int32_t
+ha_link_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ inode_t *inode,
+ struct stat *buf)
+{
+ ha_local_t *local = NULL;
+ ha_private_t *pvt = NULL;
+ char *stateino = NULL;
+ int child_count = 0, i = 0, cnt = 0;
+ call_frame_t *prev_frame = NULL;
+ xlator_t **children = NULL;
+ uint64_t tmp_stateino = 0;
+
+ local = frame->local;
+ pvt = this->private;
+ child_count = pvt->child_count;
+ prev_frame = cookie;
+ children = pvt->children;
+
+ /* find the index of the child that sent this reply */
+ for (i = 0; i < child_count; i++)
+ if (prev_frame->this == children[i])
+ break;
+
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.link.newloc.path, op_ret, op_errno);
+ }
+ /* NOTE(review): the return value is ignored; stateino is NULL if the
+ * context is missing - confirm that cannot happen here */
+ inode_ctx_get (local->stub->args.link.newloc.inode,
+ this, &tmp_stateino);
+ stateino = (char *)(long)tmp_stateino;
+
+ if (op_ret == 0) {
+ stateino[i] = 1;
+ local->op_ret = 0;
+ local->first_success = 1;
+ local->buf = *buf;
+ }
+ /* NOTE(review): call_count is decremented without frame->lock here,
+ * unlike in ha_link_lookup_cbk() */
+ cnt = --local->call_count;
+ /* next child after the current one that is marked in local->state */
+ for (i = local->active + 1; i < child_count; i++) {
+ if (local->state[i])
+ break;
+ }
+
+ /* nothing outstanding, or no further child to try: finish the call */
+ if (cnt == 0 || i == child_count) {
+ call_stub_t *stub = local->stub;
+ FREE (local->state);
+ stub = local->stub;
+ STACK_UNWIND (frame, local->op_ret, local->op_errno, local->stub->args.link.oldloc.inode, &local->buf);
+ call_stub_destroy (stub);
+ return 0;
+ }
+
+ local->active = i;
+
+ /* no success so far: retry the link itself on the next child */
+ if (local->first_success == 0) {
+ STACK_WIND (frame,
+ ha_link_cbk,
+ children[i],
+ children[i]->fops->link,
+ &local->stub->args.link.oldloc,
+ &local->stub->args.link.newloc);
+ return 0;
+ }
+ cnt = local->call_count;
+
+ /* link already succeeded once: look newloc up on the remaining marked
+ * children; stop after cnt winds so local is not read after the last
+ * wind */
+ for (; i < child_count; i++) {
+ if (local->state[i]) {
+ STACK_WIND (frame,
+ ha_link_lookup_cbk,
+ children[i],
+ children[i]->fops->lookup,
+ &local->stub->args.link.newloc,
+ 0);
+ if (--cnt == 0)
+ break;
+ }
+ }
+ return 0;
+}
+
+/*
+ * link fop.
+ *
+ * Requires newloc->inode to already carry this translator's context
+ * (EINVAL otherwise).  Builds the per-call state (resume stub plus a copy
+ * of pvt->state), counts the children whose state byte is non-zero and
+ * winds the link to the first of them; ha_link_cbk() takes it from there.
+ *
+ * Fails the call with ENOMEM when an allocation fails and with ENOTCONN
+ * when no child is marked in pvt->state (previously local->active stayed
+ * -1 and was used to pick the child to wind to).
+ */
+int32_t
+ha_link (call_frame_t *frame,
+         xlator_t *this,
+         loc_t *oldloc,
+         loc_t *newloc)
+{
+        ha_local_t   *local        = NULL;
+        ha_private_t *pvt          = NULL;
+        int           child_count  = 0, i = 0;
+        int           op_errno     = ENOTCONN;
+        char         *stateino     = NULL;
+        int32_t       ret          = 0;
+        uint64_t      tmp_stateino = 0;
+
+        ret = inode_ctx_get (newloc->inode, this, &tmp_stateino);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()");
+        }
+        stateino = (char *)(long)tmp_stateino;
+
+        if (stateino == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "newloc->inode's ctx is NULL, returning EINVAL");
+                STACK_UNWIND (frame, -1, EINVAL, oldloc->inode, NULL);
+                return 0;
+        }
+
+        pvt         = this->private;
+        child_count = pvt->child_count;
+
+        local = CALLOC (1, sizeof (*local));
+        if (local == NULL) {
+                STACK_UNWIND (frame, -1, ENOMEM, oldloc->inode, NULL);
+                return 0;
+        }
+        frame->local    = local;
+        local->op_ret   = -1;
+        local->op_errno = ENOTCONN;
+        local->active   = -1;
+
+        local->stub  = fop_link_stub (frame, ha_link, oldloc, newloc);
+        local->state = CALLOC (1, child_count);
+        if (local->stub == NULL || local->state == NULL) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+        memcpy (local->state, pvt->state, child_count);
+
+        for (i = 0; i < child_count; i++) {
+                if (local->state[i]) {
+                        local->call_count++;
+                        if (local->active == -1)
+                                local->active = i;
+                }
+        }
+
+        /* no child is up: nothing to wind to */
+        if (local->active == -1)
+                goto err;
+
+        STACK_WIND (frame,
+                    ha_link_cbk,
+                    HA_ACTIVE_CHILD(this, local),
+                    HA_ACTIVE_CHILD(this, local)->fops->link,
+                    oldloc,
+                    newloc);
+        return 0;
+
+err:
+        FREE (local->state);
+        if (local->stub != NULL) {
+                call_stub_destroy (local->stub);
+                local->stub = NULL;
+        }
+        STACK_UNWIND (frame, -1, op_errno, oldloc->inode, NULL);
+        return 0;
+}
+
+/*
+ * Callback for the create wound by ha_create() (and re-wound from here).
+ *
+ * Marks the replying child in the per-inode and per-fd state arrays on
+ * success, remembers the first successful reply, and then either unwinds
+ * or winds the create to further children marked in local->state.
+ */
+int32_t
+ha_create_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ fd_t *fd,
+ inode_t *inode,
+ struct stat *buf)
+{
+ ha_local_t *local = NULL;
+ ha_private_t *pvt = NULL;
+ int i, child_count = 0, cnt = 0, ret = 0;
+ char *stateino = NULL;
+ hafd_t *hafdp = NULL;
+ call_frame_t *prev_frame = NULL;
+ xlator_t **children = NULL;
+ uint64_t tmp_stateino = 0;
+ uint64_t tmp_hafdp = 0;
+
+ local = frame->local;
+ pvt = this->private;
+ child_count = pvt->child_count;
+ prev_frame = cookie;
+ children = pvt->children;
+
+ /* per-inode state array stored in the inode context */
+ ret = inode_ctx_get (local->stub->args.create.loc.inode,
+ this, &tmp_stateino);
+ stateino = (char *)(long)tmp_stateino;
+
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "dict_to_ptr() error");
+ /* FIXME: handle */
+ }
+ /* per-fd state stored in the fd context */
+ ret = fd_ctx_get (local->stub->args.create.fd, this, &tmp_hafdp);
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "dict_to_ptr() error");
+ /* FIXME: handle */
+ }
+ hafdp = (hafd_t *)(long)tmp_hafdp;
+
+ /* find the index of the child that sent this reply */
+ for (i = 0; i < child_count; i++) {
+ if (prev_frame->this == children[i])
+ break;
+ }
+
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.create.loc.path, op_ret, op_errno);
+ }
+ if (op_ret != -1) {
+ stateino[i] = 1;
+ hafdp->fdstate[i] = 1;
+ /* only the first successful reply is kept for the unwind */
+ if (local->op_ret == -1) {
+ local->op_ret = 0;
+ local->buf = *buf;
+ local->first_success = 1;
+ }
+ /* later creates of the same path are sent without O_EXCL */
+ local->stub->args.create.flags &= (~O_EXCL);
+ }
+ LOCK (&frame->lock);
+ cnt = --local->call_count;
+ UNLOCK (&frame->lock);
+
+ /* next child after the current one that is marked in local->state */
+ for (i = local->active + 1; i < child_count; i++) {
+ if (local->state[i])
+ break;
+ }
+
+ /* nothing outstanding, or no further child to try: finish the call */
+ if (cnt == 0 || i == child_count) {
+ char *state = local->state;
+ call_stub_t *stub = local->stub;
+ STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ stub->args.create.fd,
+ stub->args.create.loc.inode, &local->buf);
+ FREE (state);
+ call_stub_destroy (stub);
+ return 0;
+ }
+ local->active = i;
+ cnt = local->call_count;
+ /* NOTE(review): cnt is not decremented inside this loop, so the
+ * (cnt == 0) test below cannot become true here; after a first success
+ * the loop keeps reading local->state after each wind - confirm this is
+ * intended */
+ for (; i < child_count; i++) {
+ if (local->state[i]) {
+ STACK_WIND (frame,
+ ha_create_cbk,
+ children[i],
+ children[i]->fops->create,
+ &local->stub->args.create.loc,
+ local->stub->args.create.flags,
+ local->stub->args.create.mode,
+ local->stub->args.create.fd);
+ if ((local->first_success == 0) || (cnt == 0))
+ break;
+ }
+ }
+ return 0;
+}
+
+/*
+ * create fop.
+ *
+ * On first entry (frame->local is NULL) this builds the per-call state: a
+ * resume stub, a copy of pvt->state, the count of children whose state
+ * byte is non-zero and the index of the first of them.  It also attaches
+ * a fresh hafd_t to the fd context and a zeroed per-child array to the
+ * inode context.  The create is then wound to children[local->active].
+ *
+ * Fails the call with ENOTCONN when no child is marked in pvt->state
+ * (the former "FIXME handle active -1", which indexed children[-1]) and
+ * with ENOMEM when an allocation fails.  Nothing is attached to the fd or
+ * inode context on these error paths.
+ */
+int32_t
+ha_create (call_frame_t *frame,
+           xlator_t *this,
+           loc_t *loc,
+           int32_t flags,
+           mode_t mode, fd_t *fd)
+{
+        ha_local_t    *local       = NULL;
+        ha_private_t  *pvt         = NULL;
+        int            i, child_count = 0;
+        int            op_errno    = ENOTCONN;
+        char          *stateino    = NULL;
+        xlator_t     **children    = NULL;
+        hafd_t        *hafdp       = NULL;
+
+        local       = frame->local;
+        pvt         = this->private;
+        child_count = pvt->child_count;
+        children    = pvt->children;
+
+        if (local == NULL) {
+                local = CALLOC (1, sizeof (*local));
+                if (local == NULL) {
+                        op_errno = ENOMEM;
+                        goto unwind;
+                }
+                frame->local    = local;
+                local->active   = -1;
+                local->op_ret   = -1;
+                local->op_errno = ENOTCONN;
+
+                local->stub  = fop_create_stub (frame, ha_create, loc, flags, mode, fd);
+                local->state = CALLOC (1, child_count);
+                if (local->stub == NULL || local->state == NULL) {
+                        op_errno = ENOMEM;
+                        goto cleanup;
+                }
+                memcpy (local->state, pvt->state, child_count);
+
+                for (i = 0; i < pvt->child_count; i++) {
+                        if (local->state[i]) {
+                                local->call_count++;
+                                if (local->active == -1)
+                                        local->active = i;
+                        }
+                }
+
+                /* no child is up: nothing to wind to */
+                if (local->active == -1) {
+                        op_errno = ENOTCONN;
+                        goto cleanup;
+                }
+
+                stateino = CALLOC (1, child_count);
+                hafdp    = CALLOC (1, sizeof (*hafdp));
+                if (hafdp != NULL) {
+                        hafdp->fdstate = CALLOC (1, child_count);
+                        hafdp->path    = strdup(loc->path);
+                }
+                if (stateino == NULL || hafdp == NULL ||
+                    hafdp->fdstate == NULL || hafdp->path == NULL) {
+                        op_errno = ENOMEM;
+                        goto cleanup;
+                }
+                LOCK_INIT (&hafdp->lock);
+                fd_ctx_set (fd, this, (uint64_t)(long)hafdp);
+                inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino);
+        }
+
+        STACK_WIND (frame,
+                    ha_create_cbk,
+                    children[local->active],
+                    children[local->active]->fops->create,
+                    loc, flags, mode, fd);
+        return 0;
+
+cleanup:
+        if (hafdp != NULL) {
+                FREE (hafdp->fdstate);
+                FREE (hafdp->path);
+                FREE (hafdp);
+        }
+        FREE (stateino);
+        FREE (local->state);
+        if (local->stub != NULL) {
+                call_stub_destroy (local->stub);
+                local->stub = NULL;
+        }
+unwind:
+        STACK_UNWIND (frame, -1, op_errno, fd, loc->inode, NULL);
+        return 0;
+}
+
+/*
+ * Callback for the opens fanned out by ha_open().
+ *
+ * Marks the replying child in the fd's fdstate array on success, keeps
+ * the errno of any failure other than ENOTCONN, and unwinds once the last
+ * outstanding reply has arrived.
+ */
+int32_t
+ha_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+             int32_t op_ret, int32_t op_errno, fd_t *fd)
+{
+        ha_local_t    *local     = frame->local;
+        ha_private_t  *pvt       = this->private;
+        xlator_t     **subvols   = pvt->children;
+        int            nsubvols  = pvt->child_count;
+        call_frame_t  *prev      = cookie;
+        hafd_t        *hafdp     = NULL;
+        uint64_t       ctx       = 0;
+        int            idx       = 0;
+        int            remaining = 0;
+
+        if (fd_ctx_get (local->fd, this, &ctx) != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()");
+        }
+        hafdp = (hafd_t *)(long)ctx;
+
+        /* which child sent this reply? */
+        while (idx < nsubvols && subvols[idx] != prev->this)
+                idx++;
+
+        LOCK (&frame->lock);
+        if (op_ret == -1) {
+                if (op_errno != ENOTCONN)
+                        local->op_errno = op_errno;
+        } else {
+                hafdp->fdstate[idx] = 1;
+                local->op_ret = 0;
+        }
+        remaining = --local->call_count;
+        UNLOCK (&frame->lock);
+
+        if (remaining != 0)
+                return 0;
+
+        STACK_UNWIND (frame,
+                      local->op_ret,
+                      local->op_errno,
+                      local->fd);
+        return 0;
+}
+
+/*
+ * open fop.
+ *
+ * Reads the per-child state array from loc->inode's context and winds
+ * the open to every child marked in it; ha_open_cbk() collects the
+ * replies.  A fresh hafd_t (fdstate array, copy of the path, lock) is
+ * attached to the fd context before winding.
+ *
+ * Fails the call with EINVAL when the inode carries no context
+ * (previously a NULL dereference), with ENOTCONN when no child is marked
+ * (previously nothing was wound and the frame never unwound) and with
+ * ENOMEM when an allocation fails.
+ */
+int32_t
+ha_open (call_frame_t *frame,
+         xlator_t *this,
+         loc_t *loc,
+         int32_t flags, fd_t *fd)
+{
+        ha_local_t    *local        = NULL;
+        ha_private_t  *pvt          = NULL;
+        char          *stateino     = NULL;
+        xlator_t     **children     = NULL;
+        int            cnt = 0, i, child_count = 0, ret = 0;
+        hafd_t        *hafdp        = NULL;
+        uint64_t       tmp_stateino = 0;
+
+        pvt         = this->private;
+        children    = pvt->children;
+        child_count = pvt->child_count;
+
+        ret = inode_ctx_get (loc->inode, this, &tmp_stateino);
+        stateino = (char *)(long)tmp_stateino;
+        if (ret != 0 || stateino == NULL) {
+                gf_log (this->name, GF_LOG_ERROR, "inode_ctx_get() error");
+                STACK_UNWIND (frame, -1, EINVAL, fd);
+                return 0;
+        }
+
+        for (i = 0; i < child_count; i++)
+                if (stateino[i])
+                        cnt++;
+
+        /* no child knows this inode: nothing to wind to */
+        if (cnt == 0) {
+                STACK_UNWIND (frame, -1, ENOTCONN, fd);
+                return 0;
+        }
+
+        local = CALLOC (1, sizeof (*local));
+        hafdp = CALLOC (1, sizeof (*hafdp));
+        if (hafdp != NULL) {
+                hafdp->fdstate = CALLOC (1, child_count);
+                hafdp->path    = strdup (loc->path);
+        }
+        if (local == NULL || hafdp == NULL ||
+            hafdp->fdstate == NULL || hafdp->path == NULL) {
+                if (hafdp != NULL) {
+                        FREE (hafdp->fdstate);
+                        FREE (hafdp->path);
+                        FREE (hafdp);
+                }
+                FREE (local);
+                STACK_UNWIND (frame, -1, ENOMEM, fd);
+                return 0;
+        }
+
+        frame->local    = local;
+        local->op_ret   = -1;
+        local->op_errno = ENOTCONN;
+        local->fd       = fd;
+
+        hafdp->active = -1;
+        if (pvt->pref_subvol == -1) {
+                hafdp->active = fd->inode->ino % child_count;
+        }
+
+        LOCK_INIT (&hafdp->lock);
+        fd_ctx_set (fd, this, (uint64_t)(long)hafdp);
+
+        local->call_count = cnt;
+        for (i = 0; i < child_count; i++) {
+                if (stateino[i]) {
+                        STACK_WIND (frame,
+                                    ha_open_cbk,
+                                    children[i],
+                                    children[i]->fops->open,
+                                    loc, flags, fd);
+                        /* stop after the last wind: the frame may already
+                         * have been unwound by then */
+                        if (--cnt == 0)
+                                break;
+                }
+        }
+        return 0;
+}
+
+/*
+ * Callback for the readv wound by ha_readv().
+ *
+ * The reply goes through ha_handle_cbk() first; the vector, count and
+ * stat are forwarded to the caller untouched only when it returns 0.
+ * NOTE(review): ha_handle_cbk() is defined outside this block; presumably a
+ * non-zero return means it has taken over the frame - confirm.
+ */
+int32_t
+ha_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+              int32_t op_ret, int32_t op_errno,
+              struct iovec *vector, int32_t count, struct stat *stbuf)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
+        return 0;
+}
+
+/*
+ * readv fop.
+ *
+ * ha_alloc_init_fd() is expected to populate frame->local for fd; a
+ * negative return is treated as a negated errno and fails the call.
+ * Otherwise a resume stub is stored in local->stub and the read is wound
+ * to HA_ACTIVE_CHILD(this, local) with local->active as cookie.
+ *
+ * The failure unwind supplies all three trailing arguments of the readv
+ * callback signature (vector, count, stbuf), as ha_readv_cbk() does;
+ * previously only one was passed.
+ */
+int32_t
+ha_readv (call_frame_t *frame,
+          xlator_t *this,
+          fd_t *fd,
+          size_t size,
+          off_t offset)
+{
+        ha_local_t *local    = NULL;
+        int         op_errno = 0;
+
+        op_errno = ha_alloc_init_fd (frame, fd);
+        if (op_errno < 0) {
+                op_errno = -op_errno;
+                goto err;
+        }
+        local = frame->local;
+        local->stub = fop_readv_stub (frame, ha_readv, fd, size, offset);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_readv_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->readv,
+                           fd,
+                           size,
+                           offset);
+        return 0;
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL);
+        return 0;
+}
+
+/*
+ * Callback for the writev wound by ha_writev().
+ *
+ * Forwards the reply to the caller only when ha_handle_cbk() returns 0.
+ * NOTE(review): ha_handle_cbk() is defined outside this block; presumably a
+ * non-zero return means it has taken over the frame - confirm.
+ */
+int32_t
+ha_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct stat *stbuf)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, stbuf);
+        return 0;
+}
+
+/*
+ * writev fop.
+ *
+ * ha_alloc_init_fd() is expected to populate frame->local for fd; a
+ * negative return is treated as a negated errno and fails the call.
+ * Otherwise a resume stub is stored in local->stub and the write is wound
+ * to HA_ACTIVE_CHILD(this, local) with local->active as cookie.
+ */
+int32_t
+ha_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+           struct iovec *vector, int32_t count, off_t off)
+{
+        ha_local_t *local = NULL;
+        int         rc    = ha_alloc_init_fd (frame, fd);
+
+        if (rc < 0) {
+                STACK_UNWIND (frame, -1, -rc, NULL);
+                return 0;
+        }
+
+        local       = frame->local;
+        local->stub = fop_writev_stub (frame, ha_writev, fd, vector, count, off);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_writev_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->writev,
+                           fd, vector, count, off);
+        return 0;
+}
+
+/*
+ * Callback for the flush wound by ha_flush().
+ *
+ * Forwards the result to the caller only when ha_handle_cbk() returns 0.
+ * NOTE(review): ha_handle_cbk() is defined outside this block; presumably a
+ * non-zero return means it has taken over the frame - confirm.
+ */
+int32_t
+ha_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+              int32_t op_ret, int32_t op_errno)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+/*
+ * flush fop.
+ *
+ * ha_alloc_init_fd() is expected to populate frame->local for fd; a
+ * negative return is treated as a negated errno and fails the call.
+ * Otherwise a resume stub is stored in local->stub and the flush is wound
+ * to HA_ACTIVE_CHILD(this, local) with local->active as cookie.
+ */
+int32_t
+ha_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+{
+        ha_local_t *local = NULL;
+        int         rc    = ha_alloc_init_fd (frame, fd);
+
+        if (rc < 0) {
+                STACK_UNWIND (frame, -1, -rc);
+                return 0;
+        }
+
+        local       = frame->local;
+        local->stub = fop_flush_stub (frame, ha_flush, fd);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_flush_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->flush,
+                           fd);
+        return 0;
+}
+
+
+/*
+ * Callback for the fsync wound by ha_fsync().
+ *
+ * Forwards the result to the caller only when ha_handle_cbk() returns 0.
+ * NOTE(review): ha_handle_cbk() is defined outside this block; presumably a
+ * non-zero return means it has taken over the frame - confirm.
+ */
+int32_t
+ha_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+              int32_t op_ret, int32_t op_errno)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+/*
+ * fsync fop.
+ *
+ * ha_alloc_init_fd() is expected to populate frame->local for fd; a
+ * negative return is treated as a negated errno and fails the call.
+ * Otherwise a resume stub is stored in local->stub and the fsync is wound
+ * to HA_ACTIVE_CHILD(this, local) with local->active as cookie.
+ */
+int32_t
+ha_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags)
+{
+        ha_local_t *local = NULL;
+        int         rc    = ha_alloc_init_fd (frame, fd);
+
+        if (rc < 0) {
+                STACK_UNWIND (frame, -1, -rc);
+                return 0;
+        }
+
+        local       = frame->local;
+        local->stub = fop_fsync_stub (frame, ha_fsync, fd, flags);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_fsync_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->fsync,
+                           fd, flags);
+        return 0;
+}
+
+/*
+ * Callback for the fstat wound by ha_fstat().
+ *
+ * Forwards the stat to the caller only when ha_handle_cbk() returns 0.
+ * NOTE(review): ha_handle_cbk() is defined outside this block; presumably a
+ * non-zero return means it has taken over the frame - confirm.
+ */
+int32_t
+ha_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+              int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, buf);
+        return 0;
+}
+
+/*
+ * fstat fop.
+ *
+ * ha_alloc_init_fd() is expected to populate frame->local for fd; a
+ * negative return is treated as a negated errno and fails the call.
+ * Otherwise a resume stub is stored in local->stub and the fstat is wound
+ * to HA_ACTIVE_CHILD(this, local) with local->active as cookie.
+ */
+int32_t
+ha_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd)
+{
+        ha_local_t *local = NULL;
+        int         rc    = ha_alloc_init_fd (frame, fd);
+
+        if (rc < 0) {
+                STACK_UNWIND (frame, -1, -rc, NULL);
+                return 0;
+        }
+
+        local       = frame->local;
+        local->stub = fop_fstat_stub (frame, ha_fstat, fd);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_fstat_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->fstat,
+                           fd);
+        return 0;
+}
+
+/*
+ * Callback for the opendirs fanned out by ha_opendir().
+ *
+ * Marks the replying child in the fd's fdstate array on success, keeps
+ * the errno of any failure other than ENOTCONN, and unwinds once the last
+ * outstanding reply has arrived.
+ */
+int32_t
+ha_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, fd_t *fd)
+{
+        ha_local_t    *local     = frame->local;
+        ha_private_t  *pvt       = this->private;
+        xlator_t     **subvols   = pvt->children;
+        int            nsubvols  = pvt->child_count;
+        call_frame_t  *prev      = cookie;
+        hafd_t        *hafdp     = NULL;
+        uint64_t       ctx       = 0;
+        int            idx       = 0;
+        int            remaining = 0;
+
+        if (fd_ctx_get (local->fd, this, &ctx) != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()");
+        }
+        hafdp = (hafd_t *)(long)ctx;
+
+        /* which child sent this reply? */
+        while (idx < nsubvols && subvols[idx] != prev->this)
+                idx++;
+
+        LOCK (&frame->lock);
+        if (op_ret == -1) {
+                if (op_errno != ENOTCONN)
+                        local->op_errno = op_errno;
+        } else {
+                hafdp->fdstate[idx] = 1;
+                local->op_ret = 0;
+        }
+        remaining = --local->call_count;
+        UNLOCK (&frame->lock);
+
+        if (remaining != 0)
+                return 0;
+
+        STACK_UNWIND (frame,
+                      local->op_ret,
+                      local->op_errno,
+                      local->fd);
+        return 0;
+}
+
+/*
+ * opendir fop.
+ *
+ * Reads the per-child state array from loc->inode's context and winds
+ * the opendir to every child marked in it; ha_opendir_cbk() collects the
+ * replies.  A fresh hafd_t (fdstate array, copy of the path, lock) is
+ * attached to the fd context before winding.
+ *
+ * Fails the call with EINVAL when the inode carries no context
+ * (previously logged and then dereferenced), with ENOTCONN when no child
+ * is marked (previously nothing was wound and the frame never unwound)
+ * and with ENOMEM when an allocation fails.
+ */
+int32_t
+ha_opendir (call_frame_t *frame,
+            xlator_t *this,
+            loc_t *loc, fd_t *fd)
+{
+        ha_local_t    *local        = NULL;
+        ha_private_t  *pvt          = NULL;
+        char          *stateino     = NULL;
+        xlator_t     **children     = NULL;
+        int            cnt = 0, i, child_count = 0, ret = 0;
+        hafd_t        *hafdp        = NULL;
+        uint64_t       tmp_stateino = 0;
+
+        pvt         = this->private;
+        children    = pvt->children;
+        child_count = pvt->child_count;
+
+        ret = inode_ctx_get (loc->inode, this, &tmp_stateino);
+        stateino = (char *)(long)tmp_stateino;
+        if (ret != 0 || stateino == NULL) {
+                gf_log (this->name, GF_LOG_ERROR, "inode_ctx_get() error");
+                STACK_UNWIND (frame, -1, EINVAL, fd);
+                return 0;
+        }
+
+        for (i = 0; i < child_count; i++)
+                if (stateino[i])
+                        cnt++;
+
+        /* no child knows this inode: nothing to wind to */
+        if (cnt == 0) {
+                STACK_UNWIND (frame, -1, ENOTCONN, fd);
+                return 0;
+        }
+
+        local = CALLOC (1, sizeof (*local));
+        hafdp = CALLOC (1, sizeof (*hafdp));
+        if (hafdp != NULL) {
+                hafdp->fdstate = CALLOC (1, child_count);
+                hafdp->path    = strdup (loc->path);
+        }
+        if (local == NULL || hafdp == NULL ||
+            hafdp->fdstate == NULL || hafdp->path == NULL) {
+                if (hafdp != NULL) {
+                        FREE (hafdp->fdstate);
+                        FREE (hafdp->path);
+                        FREE (hafdp);
+                }
+                FREE (local);
+                STACK_UNWIND (frame, -1, ENOMEM, fd);
+                return 0;
+        }
+
+        frame->local    = local;
+        local->op_ret   = -1;
+        local->op_errno = ENOTCONN;
+        local->fd       = fd;
+
+        LOCK_INIT (&hafdp->lock);
+        fd_ctx_set (fd, this, (uint64_t)(long)hafdp);
+
+        local->call_count = cnt;
+        for (i = 0; i < child_count; i++) {
+                if (stateino[i]) {
+                        STACK_WIND (frame,
+                                    ha_opendir_cbk,
+                                    children[i],
+                                    children[i]->fops->opendir,
+                                    loc, fd);
+                        /* stop after the last wind: the frame may already
+                         * have been unwound by then */
+                        if (--cnt == 0)
+                                break;
+                }
+        }
+        return 0;
+}
+
+/*
+ * Callback for the getdents wound by ha_getdents().
+ *
+ * Forwards the entries and their count to the caller only when
+ * ha_handle_cbk() returns 0.
+ * NOTE(review): ha_handle_cbk() is defined outside this block; presumably a
+ * non-zero return means it has taken over the frame - confirm.
+ */
+int32_t
+ha_getdents_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno,
+                 dir_entry_t *entries, int32_t count)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, entries, count);
+        return 0;
+}
+
+/*
+ * getdents fop.
+ *
+ * ha_alloc_init_fd() is expected to populate frame->local for fd; a
+ * negative return is treated as a negated errno and fails the call.
+ * Otherwise a resume stub is stored in local->stub and the call is wound
+ * to HA_ACTIVE_CHILD(this, local) with local->active as cookie.
+ */
+int32_t
+ha_getdents (call_frame_t *frame, xlator_t *this, fd_t *fd,
+             size_t size, off_t offset, int32_t flag)
+{
+        ha_local_t *local = NULL;
+        int         rc    = ha_alloc_init_fd (frame, fd);
+
+        if (rc < 0) {
+                STACK_UNWIND (frame, -1, -rc, NULL, 0);
+                return 0;
+        }
+
+        local       = frame->local;
+        local->stub = fop_getdents_stub (frame, ha_getdents, fd, size, offset, flag);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_getdents_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->getdents,
+                           fd, size, offset, flag);
+        return 0;
+}
+
+/*
+ * Callback for the setdents wound by ha_setdents().
+ *
+ * Forwards the result to the caller only when ha_handle_cbk() returns 0.
+ * NOTE(review): ha_handle_cbk() is defined outside this block; presumably a
+ * non-zero return means it has taken over the frame - confirm.
+ */
+int32_t
+ha_setdents_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+/*
+ * setdents fop.
+ *
+ * ha_alloc_init_fd() is expected to populate frame->local for fd; a
+ * negative return is treated as a negated errno and fails the call.
+ * Otherwise a resume stub is stored in local->stub and the call is wound
+ * to HA_ACTIVE_CHILD(this, local) with local->active as cookie.
+ */
+int32_t
+ha_setdents (call_frame_t *frame, xlator_t *this, fd_t *fd,
+             int32_t flags, dir_entry_t *entries, int32_t count)
+{
+        ha_local_t *local = NULL;
+        int         rc    = ha_alloc_init_fd (frame, fd);
+
+        if (rc < 0) {
+                STACK_UNWIND (frame, -1, -rc);
+                return 0;
+        }
+
+        local       = frame->local;
+        local->stub = fop_setdents_stub (frame, ha_setdents, fd, flags, entries, count);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_setdents_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->setdents,
+                           fd, flags, entries, count);
+        return 0;
+}
+
+/*
+ * Callback for the fsyncdir wound by ha_fsyncdir().
+ *
+ * Forwards the result to the caller only when ha_handle_cbk() returns 0.
+ * NOTE(review): ha_handle_cbk() is defined outside this block; presumably a
+ * non-zero return means it has taken over the frame - confirm.
+ */
+int32_t
+ha_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+/*
+ * fsyncdir fop.
+ *
+ * ha_alloc_init_fd() is expected to populate frame->local for fd; a
+ * negative return is treated as a negated errno and fails the call.
+ * Otherwise a resume stub is stored in local->stub and the call is wound
+ * to HA_ACTIVE_CHILD(this, local) with local->active as cookie.
+ */
+int32_t
+ha_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags)
+{
+        ha_local_t *local = NULL;
+        int         rc    = ha_alloc_init_fd (frame, fd);
+
+        if (rc < 0) {
+                STACK_UNWIND (frame, -1, -rc);
+                return 0;
+        }
+
+        local       = frame->local;
+        local->stub = fop_fsyncdir_stub (frame, ha_fsyncdir, fd, flags);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_fsyncdir_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->fsyncdir,
+                           fd, flags);
+        return 0;
+}
+
+
+/*
+ * Callback for the statfs wound by ha_statfs().
+ *
+ * Forwards the statvfs buffer to the caller only when ha_handle_cbk()
+ * returns 0.
+ * NOTE(review): ha_handle_cbk() is defined outside this block; presumably a
+ * non-zero return means it has taken over the frame - confirm.
+ */
+int32_t
+ha_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct statvfs *buf)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, buf);
+        return 0;
+}
+
+/*
+ * statfs fop.
+ *
+ * ha_alloc_init_inode() is expected to populate frame->local for
+ * loc->inode; a negative return is treated as a negated errno and fails
+ * the call.  Otherwise a resume stub is stored in local->stub and the
+ * call is wound to HA_ACTIVE_CHILD(this, local) with local->active as
+ * cookie.
+ */
+int32_t
+ha_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        ha_local_t *local = NULL;
+        int         rc    = ha_alloc_init_inode (frame, loc->inode);
+
+        if (rc < 0) {
+                STACK_UNWIND (frame, -1, -rc, NULL);
+                return 0;
+        }
+
+        local       = frame->local;
+        local->stub = fop_statfs_stub (frame, ha_statfs, loc);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_statfs_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->statfs,
+                           loc);
+        return 0;
+}
+
+/*
+ * Callback for the setxattr wound by ha_setxattr().
+ *
+ * Forwards the result to the caller only when ha_handle_cbk() returns 0.
+ * NOTE(review): ha_handle_cbk() is defined outside this block; presumably a
+ * non-zero return means it has taken over the frame - confirm.
+ */
+int32_t
+ha_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+/*
+ * setxattr fop.
+ *
+ * ha_alloc_init_inode() is expected to populate frame->local for
+ * loc->inode; a negative return is treated as a negated errno and fails
+ * the call.  Otherwise a resume stub is stored in local->stub and the
+ * call is wound to HA_ACTIVE_CHILD(this, local) with local->active as
+ * cookie.
+ */
+int32_t
+ha_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+             dict_t *dict, int32_t flags)
+{
+        ha_local_t *local = NULL;
+        int         rc    = ha_alloc_init_inode (frame, loc->inode);
+
+        if (rc < 0) {
+                STACK_UNWIND (frame, -1, -rc);
+                return 0;
+        }
+
+        local       = frame->local;
+        local->stub = fop_setxattr_stub (frame, ha_setxattr, loc, dict, flags);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_setxattr_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->setxattr,
+                           loc, dict, flags);
+        return 0;
+}
+
+/*
+ * Callback for the getxattr wound by ha_getxattr().
+ *
+ * Forwards the returned dict to the caller only when ha_handle_cbk()
+ * returns 0.
+ * NOTE(review): ha_handle_cbk() is defined outside this block; presumably a
+ * non-zero return means it has taken over the frame - confirm.
+ */
+int32_t
+ha_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *dict)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, dict);
+        return 0;
+}
+
+/*
+ * getxattr fop.
+ *
+ * ha_alloc_init_inode() is expected to populate frame->local for
+ * loc->inode; a negative return is treated as a negated errno and fails
+ * the call.  Otherwise a resume stub is stored in local->stub and the
+ * call is wound to HA_ACTIVE_CHILD(this, local) with local->active as
+ * cookie.
+ */
+int32_t
+ha_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+             const char *name)
+{
+        ha_local_t *local = NULL;
+        int         rc    = ha_alloc_init_inode (frame, loc->inode);
+
+        if (rc < 0) {
+                STACK_UNWIND (frame, -1, -rc, NULL);
+                return 0;
+        }
+
+        local       = frame->local;
+        local->stub = fop_getxattr_stub (frame, ha_getxattr, loc, name);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_getxattr_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->getxattr,
+                           loc, name);
+        return 0;
+}
+
+/*
+ * Callback for the xattrop wound by ha_xattrop().
+ *
+ * Forwards the returned dict to the caller only when ha_handle_cbk()
+ * returns 0.
+ * NOTE(review): ha_handle_cbk() is defined outside this block; presumably a
+ * non-zero return means it has taken over the frame - confirm.
+ */
+int32_t
+ha_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *dict)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, dict);
+        return 0;
+}
+
+
+/*
+ * xattrop fop.
+ *
+ * ha_alloc_init_inode() is expected to populate frame->local for
+ * loc->inode; a negative return is treated as a negated errno and fails
+ * the call, handing the caller's dict back in the unwind.  Otherwise a
+ * resume stub is stored in local->stub and the call is wound to
+ * HA_ACTIVE_CHILD(this, local) with local->active as cookie.
+ */
+int32_t
+ha_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
+            gf_xattrop_flags_t flags, dict_t *dict)
+{
+        ha_local_t *local = NULL;
+        int         rc    = ha_alloc_init_inode (frame, loc->inode);
+
+        if (rc < 0) {
+                STACK_UNWIND (frame, -1, -rc, dict);
+                return 0;
+        }
+
+        local       = frame->local;
+        local->stub = fop_xattrop_stub (frame, ha_xattrop, loc, flags, dict);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_xattrop_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->xattrop,
+                           loc, flags, dict);
+        return 0;
+}
+
+/*
+ * Callback for the fxattrop wound by ha_fxattrop().
+ *
+ * Forwards the returned dict to the caller only when ha_handle_cbk()
+ * returns 0.
+ * NOTE(review): ha_handle_cbk() is defined outside this block; presumably a
+ * non-zero return means it has taken over the frame - confirm.
+ */
+int32_t
+ha_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *dict)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno, dict);
+        return 0;
+}
+
+/*
+ * fxattrop fop.
+ *
+ * ha_alloc_init_fd() is expected to populate frame->local for fd; a
+ * negative return is treated as a negated errno and fails the call,
+ * handing the caller's dict back in the unwind.  Otherwise a resume stub
+ * is stored in local->stub and the call is wound to
+ * HA_ACTIVE_CHILD(this, local) with local->active as cookie.
+ */
+int32_t
+ha_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
+             gf_xattrop_flags_t flags, dict_t *dict)
+{
+        ha_local_t *local = NULL;
+        int         rc    = ha_alloc_init_fd (frame, fd);
+
+        if (rc < 0) {
+                STACK_UNWIND (frame, -1, -rc, dict);
+                return 0;
+        }
+
+        local       = frame->local;
+        local->stub = fop_fxattrop_stub (frame, ha_fxattrop, fd, flags, dict);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_fxattrop_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->fxattrop,
+                           fd, flags, dict);
+        return 0;
+}
+
+/*
+ * Callback for the removexattr wound by ha_removexattr().
+ *
+ * Forwards the result to the caller only when ha_handle_cbk() returns 0.
+ * NOTE(review): ha_handle_cbk() is defined outside this block; presumably a
+ * non-zero return means it has taken over the frame - confirm.
+ */
+int32_t
+ha_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) != 0)
+                return 0;
+
+        STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+/*
+ * removexattr fop.
+ *
+ * ha_alloc_init_inode() is expected to populate frame->local for
+ * loc->inode; a negative return is treated as a negated errno and fails
+ * the call.  Otherwise a resume stub is stored in local->stub and the
+ * call is wound to HA_ACTIVE_CHILD(this, local) with local->active as
+ * cookie.
+ */
+int32_t
+ha_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+                const char *name)
+{
+        ha_local_t *local = NULL;
+        int         rc    = ha_alloc_init_inode (frame, loc->inode);
+
+        if (rc < 0) {
+                STACK_UNWIND (frame, -1, -rc);
+                return 0;
+        }
+
+        local       = frame->local;
+        local->stub = fop_removexattr_stub (frame, ha_removexattr, loc, name);
+
+        STACK_WIND_COOKIE (frame,
+                           ha_removexattr_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->removexattr,
+                           loc, name);
+        return 0;
+}
+
+/*
+ * Callback for lk requests wound with ha_lk_setlk_unlck_cbk as the
+ * continuation (by ha_lk() for unlocks and by ha_lk_setlk_cbk()).
+ *
+ * Counts down the outstanding replies and notes any success.  When the
+ * last reply arrives: if the lock saved in the stub is an F_UNLCK, the
+ * accumulated result is returned with that lock; otherwise the call is
+ * failed with EIO.
+ */
+int32_t
+ha_lk_setlk_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, struct flock *lock)
+{
+        ha_local_t  *local     = frame->local;
+        call_stub_t *stub      = NULL;
+        int          remaining = 0;
+
+        LOCK (&frame->lock);
+        remaining = --local->call_count;
+        if (op_ret == 0)
+                local->op_ret = 0;
+        UNLOCK (&frame->lock);
+
+        if (remaining != 0)
+                return 0;
+
+        stub = local->stub;
+        FREE (local->state);
+        if (stub->args.lk.lock.l_type != F_UNLCK) {
+                STACK_UNWIND (frame, -1, EIO, NULL);
+        } else {
+                STACK_UNWIND (frame, local->op_ret, local->op_errno, &stub->args.lk.lock);
+        }
+        call_stub_destroy (stub);
+        return 0;
+}
+
+/*
+ * Callback for a lock request that ha_lk() sends to the children one at
+ * a time.
+ *
+ * On success, or on ENOTCONN from the child, the same request is wound
+ * to the next child marked in local->state; when no such child is left
+ * the call is unwound with the stub's lock.  On any other failure the
+ * lock is rolled back on the marked children that precede the failing
+ * one, via ha_lk_setlk_unlck_cbk(); if there are none, the failure is
+ * returned directly.
+ *
+ * Fixes over the previous version:
+ *  - the replying child is found by comparing against children[i]
+ *    (it was compared against the cookie, which never matched and made
+ *    the next wind index past the end of children[]);
+ *  - the rollback count looks at state[j], not state[i];
+ *  - the rollback sets local->call_count, which ha_lk_setlk_unlck_cbk()
+ *    counts down, and sends an F_UNLCK instead of repeating the lock.
+ */
+int32_t
+ha_lk_setlk_cbk (call_frame_t *frame,
+                 void *cookie,
+                 xlator_t *this,
+                 int32_t op_ret,
+                 int32_t op_errno,
+                 struct flock *lock)
+{
+        ha_local_t    *local       = NULL;
+        ha_private_t  *pvt         = NULL;
+        xlator_t     **children    = NULL;
+        int            i = 0, cnt = 0, j = 0;
+        int            child_count = 0;
+        call_frame_t  *prev_frame  = NULL;
+        char          *state       = NULL;
+        call_stub_t   *stub        = NULL;
+        struct flock   unlock;
+
+        local       = frame->local;
+        pvt         = this->private;
+        children    = pvt->children;
+        child_count = pvt->child_count;
+        prev_frame  = cookie;
+        state       = local->state;
+
+        if (op_ret == 0)
+                local->op_ret = 0;
+
+        /* index of the child that sent this reply */
+        for (i = 0; i < child_count; i++) {
+                if (prev_frame->this == children[i])
+                        break;
+        }
+
+        if ((op_ret == 0) || (op_ret == -1 && op_errno == ENOTCONN)) {
+                /* move on to the next marked child, if any */
+                for (i++; i < child_count; i++) {
+                        if (state[i])
+                                break;
+                }
+                if (i >= child_count) {
+                        stub = local->stub;
+                        FREE (local->state);
+                        STACK_UNWIND (frame, 0, op_errno, &stub->args.lk.lock);
+                        call_stub_destroy (stub);
+                        return 0;
+                }
+                STACK_WIND (frame,
+                            ha_lk_setlk_cbk,
+                            children[i],
+                            children[i]->fops->lk,
+                            local->stub->args.lk.fd,
+                            local->stub->args.lk.cmd,
+                            &local->stub->args.lk.lock);
+                return 0;
+        }
+
+        /* hard failure: count the marked children tried before this one */
+        cnt = 0;
+        for (j = 0; j < i && j < child_count; j++) {
+                if (state[j])
+                        cnt++;
+        }
+
+        if (cnt == 0) {
+                FREE (local->state);
+                call_stub_destroy (local->stub);
+                STACK_UNWIND (frame,
+                              op_ret,
+                              op_errno,
+                              lock);
+                return 0;
+        }
+
+        /* undo the lock on those children; the stub keeps the original
+         * lock type, so ha_lk_setlk_unlck_cbk() reports EIO to the caller */
+        unlock            = local->stub->args.lk.lock;
+        unlock.l_type     = F_UNLCK;
+        local->call_count = cnt;
+
+        for (j = 0; j < child_count; j++) {
+                if (state[j]) {
+                        STACK_WIND (frame,
+                                    ha_lk_setlk_unlck_cbk,
+                                    children[j],
+                                    children[j]->fops->lk,
+                                    local->stub->args.lk.fd,
+                                    local->stub->args.lk.cmd,
+                                    &unlock);
+                        /* stop after the last wind: the frame may already
+                         * have been unwound by then */
+                        if (--cnt == 0)
+                                break;
+                }
+        }
+        return 0;
+}
+
+/*
+ * Callback for an F_GETLK request sent by ha_lk().
+ *
+ * A successful reply is returned to the caller at once.  On failure the
+ * request is retried on the next child marked in local->state; when none
+ * is left the last failure is returned.
+ *
+ * The search for the next child now starts after the child that just
+ * replied.  Previously it started at that same child, whose state byte
+ * is still set, so a failing child was retried indefinitely.
+ */
+int32_t
+ha_lk_getlk_cbk (call_frame_t *frame,
+                 void *cookie,
+                 xlator_t *this,
+                 int32_t op_ret,
+                 int32_t op_errno,
+                 struct flock *lock)
+{
+        ha_local_t    *local       = NULL;
+        ha_private_t  *pvt         = NULL;
+        fd_t          *fd          = NULL;
+        int            child_count = 0, i = 0;
+        xlator_t     **children    = NULL;
+        call_frame_t  *prev_frame  = NULL;
+
+        local       = frame->local;
+        pvt         = this->private;
+        fd          = local->stub->args.lk.fd;
+        child_count = pvt->child_count;
+        children    = pvt->children;
+        prev_frame  = cookie;
+
+        if (op_ret == 0) {
+                FREE (local->state);
+                call_stub_destroy (local->stub);
+                STACK_UNWIND (frame, 0, 0, lock);
+                return 0;
+        }
+
+        /* index of the child that sent this reply */
+        for (i = 0; i < child_count; i++) {
+                if (prev_frame->this == children[i])
+                        break;
+        }
+
+        /* next marked child after it */
+        for (i++; i < child_count; i++) {
+                if (local->state[i])
+                        break;
+        }
+
+        if (i >= child_count) {
+                FREE (local->state);
+                call_stub_destroy (local->stub);
+                STACK_UNWIND (frame, op_ret, op_errno, lock);
+                return 0;
+        }
+
+        STACK_WIND (frame,
+                    ha_lk_getlk_cbk,
+                    children[i],
+                    children[i]->fops->lk,
+                    fd,
+                    local->stub->args.lk.cmd,
+                    &local->stub->args.lk.lock);
+        return 0;
+}
+
+/*
+ * lk fop.
+ *
+ * Copies the fd's per-child fdstate into local->state and then:
+ *  - F_GETLK: asks the first marked child (ha_lk_getlk_cbk() retries);
+ *  - F_SETLK with F_UNLCK: sends the unlock to every marked child
+ *    (ha_lk_setlk_unlck_cbk() collects the replies);
+ *  - anything else: sends the lock to the first marked child
+ *    (ha_lk_setlk_cbk() walks the remaining ones).
+ *
+ * Added checks: a missing fd context fails the call with EBADF instead
+ * of being dereferenced, an allocation failure fails it with ENOMEM, and
+ * an fd with no marked child fails it with ENOTCONN (previously this
+ * indexed children[child_count], or wound nothing and never unwound).
+ *
+ * NOTE(review): when frame->local is NULL on entry the freshly created
+ * local has active == -1, so the check below fails the call with
+ * ENOTCONN right away.  That behaviour is kept as found; confirm whether
+ * it is intended.
+ */
+int32_t
+ha_lk (call_frame_t *frame,
+       xlator_t *this,
+       fd_t *fd,
+       int32_t cmd,
+       struct flock *lock)
+{
+        ha_local_t    *local       = NULL;
+        ha_private_t  *pvt         = NULL;
+        hafd_t        *hafdp       = NULL;
+        int            child_count = 0, i = 0, cnt = 0, ret = 0;
+        int            first       = -1;
+        xlator_t     **children    = NULL;
+        uint64_t       tmp_hafdp   = 0;
+
+        local       = frame->local;
+        pvt         = this->private;
+        child_count = pvt->child_count;
+        children    = pvt->children;
+        ret = fd_ctx_get (fd, this, &tmp_hafdp);
+        if (ret < 0)
+                gf_log (this->name, GF_LOG_ERROR, "fd_ctx_get failed");
+        hafdp = (hafd_t *)(long)tmp_hafdp;
+
+        if (local == NULL) {
+                local = frame->local = CALLOC (1, sizeof (*local));
+                if (local == NULL) {
+                        STACK_UNWIND (frame, -1, ENOMEM, NULL);
+                        return 0;
+                }
+                local->active   = -1;
+                local->op_ret   = -1;
+                local->op_errno = ENOTCONN;
+        }
+
+        if (local->active == -1) {
+                STACK_UNWIND (frame, -1, ENOTCONN, NULL);
+                return 0;
+        }
+
+        if (hafdp == NULL) {
+                STACK_UNWIND (frame, -1, EBADF, NULL);
+                return 0;
+        }
+
+        local->stub  = fop_lk_stub (frame, ha_lk, fd, cmd, lock);
+        local->state = CALLOC (1, child_count);
+        if (local->stub == NULL || local->state == NULL) {
+                FREE (local->state);
+                if (local->stub != NULL) {
+                        call_stub_destroy (local->stub);
+                        local->stub = NULL;
+                }
+                STACK_UNWIND (frame, -1, ENOMEM, NULL);
+                return 0;
+        }
+
+        LOCK (&hafdp->lock);
+        memcpy (local->state, hafdp->fdstate, child_count);
+        UNLOCK (&hafdp->lock);
+
+        /* count the marked children and remember the first one */
+        for (i = 0; i < child_count; i++) {
+                if (local->state[i]) {
+                        cnt++;
+                        if (first == -1)
+                                first = i;
+                }
+        }
+
+        if (cnt == 0) {
+                FREE (local->state);
+                call_stub_destroy (local->stub);
+                local->stub = NULL;
+                STACK_UNWIND (frame, -1, ENOTCONN, NULL);
+                return 0;
+        }
+
+        if (cmd == F_GETLK) {
+                STACK_WIND (frame,
+                            ha_lk_getlk_cbk,
+                            children[first],
+                            children[first]->fops->lk,
+                            fd,
+                            cmd,
+                            lock);
+        } else if (cmd == F_SETLK && lock->l_type == F_UNLCK) {
+                local->call_count += cnt;
+                for (i = 0; i < child_count; i++) {
+                        if (local->state[i]) {
+                                STACK_WIND (frame,
+                                            ha_lk_setlk_unlck_cbk,
+                                            children[i],
+                                            children[i]->fops->lk,
+                                            fd, cmd, lock);
+                                /* stop after the last wind: the frame may
+                                 * already have been unwound by then */
+                                if (--cnt == 0)
+                                        break;
+                        }
+                }
+        } else {
+                STACK_WIND (frame,
+                            ha_lk_setlk_cbk,
+                            children[first],
+                            children[first]->fops->lk,
+                            fd,
+                            cmd,
+                            lock);
+        }
+        return 0;
+}
+
+/*
+ * Shared callback for ha_inodelk and ha_entrylk.  The reply is handed to
+ * ha_handle_cbk; only when that returns 0 is the result passed up the stack.
+ */
+int32_t
+ha_inode_entry_lk_cbk (call_frame_t *frame,
+                       void *cookie,
+                       xlator_t *this,
+                       int32_t op_ret,
+                       int32_t op_errno)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) == 0) {
+                STACK_UNWIND (frame, op_ret, op_errno);
+        }
+
+        return 0;
+}
+
+/*
+ * ha_inodelk - wind an inodelk request to the currently active child.
+ *
+ * ha_alloc_init_inode prepares frame->local; a negative return is treated
+ * as a negated errno and the call is unwound with it.  A stub of the call is
+ * stored in local->stub before winding.
+ */
+int32_t
+ha_inodelk (call_frame_t *frame,
+            xlator_t *this,
+            loc_t *loc,
+            int32_t cmd,
+            struct flock *lock)
+{
+        ha_local_t *local = NULL;
+        int         ret   = 0;
+
+        ret = ha_alloc_init_inode (frame, loc->inode);
+        if (ret < 0) {
+                STACK_UNWIND (frame, -1, -ret);
+                return 0;
+        }
+
+        local = frame->local;
+        local->stub = fop_inodelk_stub (frame, ha_inodelk, loc, cmd, lock);
+
+        /* the index of the active child travels as the cookie */
+        STACK_WIND_COOKIE (frame,
+                           ha_inode_entry_lk_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->inodelk,
+                           loc,
+                           cmd,
+                           lock);
+        return 0;
+}
+
+/*
+ * ha_entrylk - wind an entrylk request to the currently active child.
+ *
+ * ha_alloc_init_inode prepares frame->local; a negative return is treated
+ * as a negated errno and the call is unwound with it.  A stub of the call is
+ * stored in local->stub before winding.
+ */
+int32_t
+ha_entrylk (call_frame_t *frame,
+            xlator_t *this,
+            loc_t *loc,
+            const char *basename,
+            entrylk_cmd cmd,
+            entrylk_type type)
+{
+        ha_local_t *local = NULL;
+        int         ret   = 0;
+
+        ret = ha_alloc_init_inode (frame, loc->inode);
+        if (ret < 0) {
+                STACK_UNWIND (frame, -1, -ret);
+                return 0;
+        }
+
+        local = frame->local;
+        local->stub = fop_entrylk_stub (frame, ha_entrylk, loc, basename, cmd, type);
+
+        /* the index of the active child travels as the cookie */
+        STACK_WIND_COOKIE (frame,
+                           ha_inode_entry_lk_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->entrylk,
+                           loc, basename, cmd, type);
+        return 0;
+}
+
+/*
+ * Callback for ha_checksum.  The reply is handed to ha_handle_cbk; only when
+ * that returns 0 are the checksums passed up the stack.
+ */
+int32_t
+ha_checksum_cbk (call_frame_t *frame,
+                 void *cookie,
+                 xlator_t *this,
+                 int32_t op_ret,
+                 int32_t op_errno,
+                 uint8_t *file_checksum,
+                 uint8_t *dir_checksum)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) == 0) {
+                STACK_UNWIND (frame, op_ret, op_errno,
+                              file_checksum, dir_checksum);
+        }
+
+        return 0;
+}
+
+/*
+ * ha_checksum - wind a checksum request to the currently active child.
+ *
+ * ha_alloc_init_inode prepares frame->local; a negative return is treated
+ * as a negated errno and the call is unwound with it.  A stub of the call is
+ * stored in local->stub before winding.
+ */
+int32_t
+ha_checksum (call_frame_t *frame,
+             xlator_t *this,
+             loc_t *loc,
+             int32_t flag)
+{
+        ha_local_t *local = NULL;
+        int         ret   = 0;
+
+        ret = ha_alloc_init_inode (frame, loc->inode);
+        if (ret < 0) {
+                STACK_UNWIND (frame, -1, -ret, NULL, NULL);
+                return 0;
+        }
+
+        local = frame->local;
+        local->stub = fop_checksum_stub (frame, ha_checksum, loc, flag);
+
+        /* the index of the active child travels as the cookie */
+        STACK_WIND_COOKIE (frame,
+                           ha_checksum_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->checksum,
+                           loc,
+                           flag);
+        return 0;
+}
+
+/*
+ * Callback for ha_readdir.  The reply is handed to ha_handle_cbk; only when
+ * that returns 0 are the directory entries passed up the stack.
+ */
+int32_t
+ha_readdir_cbk (call_frame_t *frame,
+                void *cookie,
+                xlator_t *this,
+                int32_t op_ret,
+                int32_t op_errno,
+                gf_dirent_t *entries)
+{
+        if (ha_handle_cbk (frame, cookie, op_ret, op_errno) == 0) {
+                STACK_UNWIND (frame, op_ret, op_errno, entries);
+        }
+
+        return 0;
+}
+
+/*
+ * ha_readdir - wind a readdir request to the currently active child.
+ *
+ * ha_alloc_init_fd prepares frame->local; a negative return is treated as a
+ * negated errno.  The error path now reports that errno, as ha_inodelk,
+ * ha_entrylk and ha_checksum do, instead of a hard-coded ENOTCONN that
+ * discarded the value just computed.
+ */
+int32_t
+ha_readdir (call_frame_t *frame,
+            xlator_t *this,
+            fd_t *fd,
+            size_t size,
+            off_t off)
+{
+        ha_local_t *local = NULL;
+        int op_errno = 0;
+
+        op_errno = ha_alloc_init_fd (frame, fd);
+        if (op_errno < 0) {
+                op_errno = -op_errno;
+                goto err;
+        }
+        local = frame->local;
+        local->stub = fop_readdir_stub (frame, ha_readdir, fd, size, off);
+
+        /* the index of the active child travels as the cookie */
+        STACK_WIND_COOKIE (frame,
+                           ha_readdir_cbk,
+                           (void *)(long)local->active,
+                           HA_ACTIVE_CHILD(this, local),
+                           HA_ACTIVE_CHILD(this, local)->fops->readdir,
+                           fd, size, off);
+        return 0;
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL);
+        return 0;
+}
+
+/* Management operations */
+
+/*
+ * Callback for ha_stats.
+ *
+ * Any reply other than (-1, ENOTCONN) is passed straight up.  On ENOTCONN
+ * the request is re-wound to the next child, after the one that answered,
+ * whose pvt->state entry is set; if there is none the call is unwound with
+ * ENOTCONN.
+ */
+int32_t
+ha_stats_cbk (call_frame_t *frame,
+              void *cookie,
+              xlator_t *this,
+              int32_t op_ret,
+              int32_t op_errno,
+              struct xlator_stats *stats)
+{
+        ha_local_t    *local      = frame->local;
+        ha_private_t  *pvt        = this->private;
+        call_frame_t  *prev_frame = cookie;
+        xlator_t     **children   = pvt->children;
+        int            i          = 0;
+
+        if (!(op_ret == -1 && op_errno == ENOTCONN)) {
+                STACK_UNWIND (frame,
+                              op_ret,
+                              op_errno,
+                              stats);
+                return 0;
+        }
+
+        /* find the child that produced this reply ... */
+        while (i < pvt->child_count && prev_frame->this != children[i])
+                i++;
+
+        /* ... and continue with the next child that is marked up */
+        for (i++; i < pvt->child_count; i++) {
+                if (pvt->state[i])
+                        break;
+        }
+
+        if (i == pvt->child_count) {
+                STACK_UNWIND (frame, -1, ENOTCONN, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame,
+                    ha_stats_cbk,
+                    children[i],
+                    children[i]->mops->stats,
+                    local->flags);
+        return 0;
+}
+
+/*
+ * ha_stats - management op: wind stats to the first child whose pvt->state
+ * entry is set, or unwind with ENOTCONN when none is.  The flags are kept in
+ * frame->local so that ha_stats_cbk can retry on another child.
+ */
+int32_t
+ha_stats (call_frame_t *frame,
+          xlator_t *this,
+          int32_t flags)
+{
+        ha_local_t    *local    = NULL;
+        ha_private_t  *pvt      = NULL;
+        xlator_t     **children = NULL;
+        int            i        = 0;
+
+        local = CALLOC (1, sizeof (*local));
+        frame->local = local;
+
+        pvt = this->private;
+        children = pvt->children;
+
+        /* skip children that are currently marked down */
+        while (i < pvt->child_count && !pvt->state[i])
+                i++;
+
+        if (i == pvt->child_count) {
+                STACK_UNWIND (frame, -1, ENOTCONN, NULL);
+                return 0;
+        }
+
+        local->flags = flags;
+
+        STACK_WIND (frame,
+                    ha_stats_cbk,
+                    children[i],
+                    children[i]->mops->stats,
+                    flags);
+        return 0;
+}
+
+
+/*
+ * Callback for ha_getspec.
+ *
+ * Any reply other than (-1, ENOTCONN) is passed straight up.  On ENOTCONN
+ * the request (key and flags saved in frame->local) is re-wound to the next
+ * child, after the one that answered, whose pvt->state entry is set; if
+ * there is none the call is unwound with ENOTCONN.
+ */
+int32_t
+ha_getspec_cbk (call_frame_t *frame,
+                void *cookie,
+                xlator_t *this,
+                int32_t op_ret,
+                int32_t op_errno,
+                char *spec_data)
+{
+        ha_local_t    *local      = frame->local;
+        ha_private_t  *pvt        = this->private;
+        call_frame_t  *prev_frame = cookie;
+        xlator_t     **children   = pvt->children;
+        int            i          = 0;
+
+        if (!(op_ret == -1 && op_errno == ENOTCONN)) {
+                STACK_UNWIND (frame,
+                              op_ret,
+                              op_errno,
+                              spec_data);
+                return 0;
+        }
+
+        /* find the child that produced this reply ... */
+        while (i < pvt->child_count && prev_frame->this != children[i])
+                i++;
+
+        /* ... and continue with the next child that is marked up */
+        for (i++; i < pvt->child_count; i++) {
+                if (pvt->state[i])
+                        break;
+        }
+
+        if (i == pvt->child_count) {
+                STACK_UNWIND (frame, -1, ENOTCONN, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame,
+                    ha_getspec_cbk,
+                    children[i],
+                    children[i]->mops->getspec,
+                    local->pattern,
+                    local->flags);
+        return 0;
+}
+
+/*
+ * ha_getspec - management op: wind getspec to the first child whose
+ * pvt->state entry is set, or unwind with ENOTCONN when none is.  The key
+ * and flags are kept in frame->local so that ha_getspec_cbk can retry on
+ * another child.
+ *
+ * frame->local is allocated exactly once (the original allocated it twice
+ * and leaked the first block) and the allocation is checked before use.
+ */
+int32_t
+ha_getspec (call_frame_t *frame,
+            xlator_t *this,
+            const char *key,
+            int32_t flags)
+{
+        ha_local_t *local = NULL;
+        ha_private_t *pvt = NULL;
+        xlator_t **children = NULL;
+        int i = 0;
+
+        pvt = this->private;
+        children = pvt->children;
+
+        local = frame->local = CALLOC (1, sizeof (*local));
+        if (local == NULL) {
+                gf_log (this->name, GF_LOG_ERROR, "out of memory");
+                STACK_UNWIND (frame, -1, ENOMEM, NULL);
+                return 0;
+        }
+
+        for (i = 0; i < pvt->child_count; i++) {
+                if (pvt->state[i])
+                        break;
+        }
+
+        if (i == pvt->child_count) {
+                STACK_UNWIND (frame, -1, ENOTCONN, NULL);
+                return 0;
+        }
+        local->flags = flags;
+        /* only the pointer is kept, not a copy of the key */
+        local->pattern = (char *)key;
+
+        STACK_WIND (frame,
+                    ha_getspec_cbk,
+                    children[i],
+                    children[i]->mops->getspec,
+                    key, flags);
+        return 0;
+}
+
+/*
+ * ha_closedir - releasedir callback: detach the ha context from the fd and
+ * release its fdstate buffer, its path and its lock.
+ *
+ * NOTE(review): the hafd_t object itself is not released here - confirm
+ * against the code that allocates it.
+ */
+int32_t
+ha_closedir (xlator_t *this,
+             fd_t *fd)
+{
+        uint64_t  tmp_hafdp = 0;
+        hafd_t   *hafdp     = NULL;
+
+        if (fd_ctx_del (fd, this, &tmp_hafdp) != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "fd_ctx_del() error");
+                return 0;
+        }
+
+        hafdp = (hafd_t *)(long)tmp_hafdp;
+
+        FREE (hafdp->fdstate);
+        FREE (hafdp->path);
+        LOCK_DESTROY (&hafdp->lock);
+
+        return 0;
+}
+
+/*
+ * ha_close - release callback: detach the ha context from the fd and
+ * release its fdstate buffer, its path and its lock.
+ *
+ * NOTE(review): the hafd_t object itself is not released here - confirm
+ * against the code that allocates it.
+ */
+int32_t
+ha_close (xlator_t *this,
+          fd_t *fd)
+{
+        uint64_t  tmp_hafdp = 0;
+        hafd_t   *hafdp     = NULL;
+
+        if (fd_ctx_del (fd, this, &tmp_hafdp) != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "fd_ctx_del() error");
+                return 0;
+        }
+
+        hafdp = (hafd_t *)(long)tmp_hafdp;
+
+        FREE (hafdp->fdstate);
+        FREE (hafdp->path);
+        LOCK_DESTROY (&hafdp->lock);
+
+        return 0;
+}
+
+/* notify */
+/*
+ * notify - event handler of the ha translator.
+ *
+ * GF_EVENT_CHILD_DOWN clears the child's entry in pvt->state and is passed
+ * on (default_notify) only when no child remains up.
+ * GF_EVENT_CHILD_UP sets the child's entry and is passed on only when it is
+ * the only child that is up.
+ * Every other event is passed on unchanged.
+ *
+ * An event whose data pointer matches none of the children is logged and
+ * ignored; the original code indexed children[] and state[] one past the
+ * end in that case.
+ */
+int32_t
+notify (xlator_t *this,
+        int32_t event,
+        void *data,
+        ...)
+{
+        ha_private_t *pvt = NULL;
+        int32_t i = 0, upcnt = 0;
+
+        pvt = this->private;
+        if (pvt == NULL) {
+                gf_log (this->name, GF_LOG_DEBUG, "got notify before init()");
+                return 0;
+        }
+
+        switch (event)
+        {
+        case GF_EVENT_CHILD_DOWN:
+        {
+                for (i = 0; i < pvt->child_count; i++) {
+                        if (data == pvt->children[i])
+                                break;
+                }
+                if (i == pvt->child_count) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "GF_EVENT_CHILD_DOWN from an unknown subvolume, ignoring");
+                        break;
+                }
+                gf_log (this->name, GF_LOG_DEBUG, "GF_EVENT_CHILD_DOWN from %s", pvt->children[i]->name);
+                pvt->state[i] = 0;
+
+                /* propagate only when the last child has gone down */
+                for (i = 0; i < pvt->child_count; i++) {
+                        if (pvt->state[i])
+                                break;
+                }
+                if (i == pvt->child_count) {
+                        default_notify (this, event, data);
+                }
+        }
+        break;
+        case GF_EVENT_CHILD_UP:
+        {
+                for (i = 0; i < pvt->child_count; i++) {
+                        if (data == pvt->children[i])
+                                break;
+                }
+                if (i == pvt->child_count) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "GF_EVENT_CHILD_UP from an unknown subvolume, ignoring");
+                        break;
+                }
+
+                gf_log (this->name, GF_LOG_DEBUG, "GF_EVENT_CHILD_UP from %s", pvt->children[i]->name);
+
+                pvt->state[i] = 1;
+
+                for (i = 0; i < pvt->child_count; i++) {
+                        if (pvt->state[i])
+                                upcnt++;
+                }
+
+                /* propagate only for the first child that comes up */
+                if (upcnt == 1) {
+                        default_notify (this, event, data);
+                }
+        }
+        break;
+
+        default:
+        {
+                default_notify (this, event, data);
+        }
+        }
+
+        return 0;
+}
+
+/*
+ * init - set up the private state of the ha translator.
+ *
+ * Requires at least one child.  Reads the optional "preferred-subvolume"
+ * option (-1 when absent), records the children in an array and allocates
+ * one zero-initialised state byte per child.
+ *
+ * Returns 0 on success, -1 when no child is configured or an allocation
+ * fails (the original used the allocation results unchecked).
+ */
+int
+init (xlator_t *this)
+{
+        ha_private_t *pvt = NULL;
+        xlator_list_t *trav = NULL;
+        int count = 0, ret = 0;
+
+        if (!this->children) {
+                gf_log (this->name,GF_LOG_ERROR,
+                        "FATAL: ha should have one or more child defined");
+                return -1;
+        }
+
+        if (!this->parents) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "dangling volume. check volfile ");
+        }
+
+        pvt = CALLOC (1, sizeof (ha_private_t));
+        if (pvt == NULL) {
+                gf_log (this->name, GF_LOG_ERROR, "out of memory");
+                return -1;
+        }
+
+        ret = dict_get_int32 (this->options, "preferred-subvolume",
+                              &pvt->pref_subvol);
+        if (ret < 0) {
+                pvt->pref_subvol = -1;
+        }
+
+        for (trav = this->children; trav; trav = trav->next)
+                count++;
+
+        pvt->child_count = count;
+        pvt->children = CALLOC (count, sizeof (xlator_t*));
+        pvt->state = CALLOC (1, count);
+        if (pvt->children == NULL || pvt->state == NULL) {
+                gf_log (this->name, GF_LOG_ERROR, "out of memory");
+                goto err;
+        }
+
+        count = 0;
+        for (trav = this->children; trav; trav = trav->next) {
+                pvt->children[count] = trav->xlator;
+                count++;
+        }
+
+        this->private = pvt;
+        return 0;
+
+ err:
+        if (pvt->children != NULL) {
+                FREE (pvt->children);
+        }
+        if (pvt->state != NULL) {
+                FREE (pvt->state);
+        }
+        FREE (pvt);
+        return -1;
+}
+
+/*
+ * fini - release the private state created by init.
+ *
+ * Frees the state and children arrays that init allocates (the original
+ * freed only the enclosing structure) and tolerates a NULL this->private.
+ */
+void
+fini (xlator_t *this)
+{
+        ha_private_t *priv = NULL;
+
+        priv = this->private;
+        if (priv == NULL)
+                return;
+
+        if (priv->state != NULL) {
+                FREE (priv->state);
+        }
+        if (priv->children != NULL) {
+                FREE (priv->children);
+        }
+        FREE (priv);
+        this->private = NULL;
+        return;
+}
+
+
+/*
+ * File-operation table of the ha translator.
+ *
+ * NOTE(review): ha_inodelk and ha_entrylk are defined in this file but are
+ * not registered here - confirm whether that is intentional.
+ */
+struct xlator_fops fops = {
+        /* name-space operations */
+        .lookup      = ha_lookup,
+        .lookup_cbk  = ha_lookup_cbk,
+        .mknod       = ha_mknod,
+        .mkdir       = ha_mkdir,
+        .unlink      = ha_unlink,
+        .rmdir       = ha_rmdir,
+        .symlink     = ha_symlink,
+        .rename      = ha_rename,
+        .link        = ha_link,
+        .create      = ha_create,
+
+        /* path-based operations */
+        .stat        = ha_stat,
+        .readlink    = ha_readlink,
+        .chmod       = ha_chmod,
+        .chown       = ha_chown,
+        .truncate    = ha_truncate,
+        .utimens     = ha_utimens,
+        .access      = ha_access,
+        .statfs      = ha_statfs,
+        .checksum    = ha_checksum,
+
+        /* extended attributes */
+        .setxattr    = ha_setxattr,
+        .getxattr    = ha_getxattr,
+        .removexattr = ha_removexattr,
+        .xattrop     = ha_xattrop,
+        .fxattrop    = ha_fxattrop,
+
+        /* operations on an open file */
+        .open        = ha_open,
+        .readv       = ha_readv,
+        .writev      = ha_writev,
+        .flush       = ha_flush,
+        .fsync       = ha_fsync,
+        .ftruncate   = ha_ftruncate,
+        .fstat       = ha_fstat,
+        .fchmod      = ha_fchmod,
+        .fchown      = ha_fchown,
+        .lk          = ha_lk,
+
+        /* operations on an open directory */
+        .opendir     = ha_opendir,
+        .readdir     = ha_readdir,
+        .getdents    = ha_getdents,
+        .setdents    = ha_setdents,
+        .fsyncdir    = ha_fsyncdir
+};
+
+/* Management-operation table of the ha translator. */
+struct xlator_mops mops = {
+        .getspec = ha_getspec,
+        .stats   = ha_stats,
+};
+
+/* Callback table: fd release and inode forget handlers. */
+struct xlator_cbks cbks = {
+        .forget     = ha_forget,
+        .release    = ha_close,
+        .releasedir = ha_closedir,
+};
diff --git a/xlators/cluster/ha/src/ha.h b/xlators/cluster/ha/src/ha.h
new file mode 100644
index 000000000..77a04f165
--- /dev/null
+++ b/xlators/cluster/ha/src/ha.h
@@ -0,0 +1,59 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __HA_H_
+#define __HA_H_
+
+/* Per-call state that the ha translator keeps in frame->local. */
+typedef struct {
+        call_stub_t *stub;              /* stub of the fop being served */
+        int32_t      op_ret;
+        int32_t      op_errno;
+        int32_t      active;            /* index into ha_private_t.children,
+                                           see HA_ACTIVE_CHILD; -1 = none */
+        int32_t      tries;
+        int32_t      revalidate;
+        int32_t      revalidate_error;
+        int32_t      call_count;        /* number of winds issued by ha_lk
+                                           for an unlock */
+        char        *state;             /* one byte per child */
+        char        *pattern;           /* key passed to getspec */
+        dict_t      *dict;
+        loc_t       *loc;
+        struct stat  buf;
+        fd_t        *fd;
+        inode_t     *inode;
+        int32_t      flags;             /* flags passed to stats/getspec */
+        int32_t      first_success;
+} ha_local_t;
+
+/* Translator-wide state of ha, stored in this->private. */
+typedef struct {
+        char      *state;               /* one byte per child; non-zero
+                                           while the child is up */
+        xlator_t **children;            /* array of child_count children */
+        int        child_count;
+        int        pref_subvol;         /* "preferred-subvolume" option,
+                                           -1 when not set */
+} ha_private_t;
+
+/* Per-fd context that the ha translator attaches to an fd. */
+typedef struct {
+        char      *fdstate;             /* one byte per child */
+        char      *path;
+        gf_lock_t  lock;                /* held while fdstate is copied */
+        int        active;
+} hafd_t;
+
+/*
+ * Child xlator selected by local->active.  Both arguments are parenthesized
+ * so that the macro also expands correctly for non-trivial expressions.
+ */
+#define HA_ACTIVE_CHILD(this, local) (((ha_private_t *)(this)->private)->children[(local)->active])
+
+/* Prepares frame->local for an fd-based fop; callers treat a negative
+ * return value as a negated errno. */
+extern int ha_alloc_init_fd (call_frame_t *frame, fd_t *fd);
+
+/* Called first by every fop callback; the callbacks unwind the call only
+ * when this returns 0. */
+extern int ha_handle_cbk (call_frame_t *frame, void *cookie, int op_ret, int op_errno) ;
+
+/* Prepares frame->local for an inode-based fop; callers treat a negative
+ * return value as a negated errno. */
+extern int ha_alloc_init_inode (call_frame_t *frame, inode_t *inode);
+
+#endif
diff --git a/xlators/cluster/map/Makefile.am b/xlators/cluster/map/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/cluster/map/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/cluster/map/src/Makefile.am b/xlators/cluster/map/src/Makefile.am
new file mode 100644
index 000000000..44ee4d9ee
--- /dev/null
+++ b/xlators/cluster/map/src/Makefile.am
@@ -0,0 +1,15 @@
+xlator_LTLIBRARIES = map.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+map_la_LDFLAGS = -module -avoidversion
+
+map_la_SOURCES = map.c map-helper.c
+map_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = map.h
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
+ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES =
+
diff --git a/xlators/cluster/map/src/map-helper.c b/xlators/cluster/map/src/map-helper.c
new file mode 100644
index 000000000..4e51219d4
--- /dev/null
+++ b/xlators/cluster/map/src/map-helper.c
@@ -0,0 +1,357 @@
+/*
+ Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "map.h"
+
+
+/*
+ * Return the subvolume that follows 'prev' in priv->xlarray, or NULL when
+ * 'prev' is the last entry or is not found.
+ */
+xlator_t *
+map_subvol_next (xlator_t *this, xlator_t *prev)
+{
+        map_private_t *priv = this->private;
+        int            i    = 0;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (priv->xlarray[i].xl != prev)
+                        continue;
+
+                if ((i + 1) < priv->child_count)
+                        return priv->xlarray[i + 1].xl;
+
+                return NULL;
+        }
+
+        return NULL;
+}
+
+/*
+ * Return the index of 'subvol' in priv->xlarray, or -1 when it is not one
+ * of the entries.
+ */
+int
+map_subvol_cnt (xlator_t *this, xlator_t *subvol)
+{
+        map_private_t *priv = this->private;
+        int            i    = 0;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (subvol == priv->xlarray[i].xl)
+                        return i;
+        }
+
+        return -1;
+}
+
+/*
+ * Encode the index of 'subvol' into the value x:
+ *     y = x * child_count + index_of(subvol)
+ * The all-ones value (uint64_t) -1 is passed through unchanged.  The result
+ * is stored in *y_p when y_p is not NULL.  Always returns 0.
+ */
+int
+map_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
+{
+        map_private_t *priv = NULL;
+        int            cnt  = 0;
+        int            max  = 0;
+        uint64_t       y    = (uint64_t) -1;
+
+        if (x != ((uint64_t) -1)) {
+                priv = this->private;
+
+                max = priv->child_count;
+                cnt = map_subvol_cnt (this, subvol);
+
+                y = ((x * max) + cnt);
+        }
+
+        if (y_p)
+                *y_p = y;
+
+        return 0;
+}
+
+
+/*
+ * Inverse of map_itransform: split y into the subvolume index
+ * (y % child_count) and the original value (y / child_count).  Each result
+ * is stored only when its output pointer is not NULL.  Always returns 0.
+ */
+int
+map_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p,
+                  uint64_t *x_p)
+{
+        map_private_t *priv = this->private;
+        int            max  = priv->child_count;
+        int            cnt  = y % max;
+        uint64_t       x    = y / max;
+
+        if (subvol_p)
+                *subvol_p = priv->xlarray[cnt].xl;
+
+        if (x_p)
+                *x_p = x;
+
+        return 0;
+}
+
+
+/*
+ * Pick the subvolume for 'path' from the configured directory patterns.
+ *
+ * Returns NULL for the root path "/".  Otherwise returns the subvolume of
+ * the first pattern whose directory is a prefix of 'path' ending on a path
+ * component boundary ('/' or end of string), or priv->default_xl when no
+ * pattern matches.
+ */
+xlator_t *
+get_mapping_subvol_from_path (xlator_t *this, const char *path)
+{
+        map_private_t      *priv = NULL;
+        struct map_pattern *map  = NULL;
+        char                tail = 0;
+
+        /* To make sure we handle '/' properly */
+        if (strcmp (path, "/") == 0)
+                return NULL;
+
+        priv = this->private;
+
+        for (map = priv->map; map; map = map->next) {
+                if (strncmp (map->directory, path, map->dir_len) != 0)
+                        continue;
+
+                tail = path[map->dir_len];
+                if ((tail == '/') || (tail == '\0'))
+                        return map->xl;
+        }
+
+        return priv->default_xl;
+}
+
+/*
+ * Return the subvolume pointer stored in the inode context for this
+ * translator, or NULL when inode_ctx_get reports failure.
+ */
+xlator_t *
+get_mapping_subvol_from_ctx (xlator_t *this, inode_t *inode)
+{
+        uint64_t value = 0;
+
+        if (inode_ctx_get (inode, this, &value) != 0)
+                return NULL;
+
+        return (xlator_t *)(long)value;
+}
+
+/*
+ * Mark 'subvol' as mapped in priv->xlarray.
+ *
+ * Returns 0 when the subvolume was found and not yet mapped; returns -1
+ * (after logging) when it is already mapped or is not in the array.
+ */
+int
+check_multiple_volume_entry (xlator_t *this,
+                             xlator_t *subvol)
+{
+        map_private_t *priv = this->private;
+        int            idx  = 0;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (priv->xlarray[idx].xl != subvol)
+                        continue;
+
+                if (priv->xlarray[idx].mapped) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "subvolume '%s' is already mapped",
+                                subvol->name);
+                        return -1;
+                }
+
+                priv->xlarray[idx].mapped = 1;
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "subvolume '%s' is not found",
+                subvol->name);
+
+        return -1;
+}
+
+/*
+ * Validate one "directory -> subvolume" mapping and record it.
+ *
+ * 'directory' is either "*" (the default subvolume) or an absolute,
+ * top-level directory (a trailing '/' is tolerated).  'subvol' must name one
+ * of this translator's children, and each child may be mapped only once.
+ *
+ * Returns 0 on success and -1 on any validation or allocation failure.
+ *
+ * Changes from the original: allocation results are checked, a second "*"
+ * no longer replaces the default subvolume that is already set (it is still
+ * reported as an error), and strlen is evaluated once rather than on every
+ * loop iteration.
+ */
+int
+verify_dir_and_assign_subvol (xlator_t *this,
+                              const char *directory,
+                              const char *subvol)
+{
+        int                 default_flag = 0;
+        int                 ret          = -1;
+        size_t              idx          = 0;
+        size_t              len          = 0;
+        map_private_t      *priv         = NULL;
+        xlator_list_t      *trav         = NULL;
+        struct map_pattern *tmp_map      = NULL;
+        struct map_pattern *trav_map     = NULL;
+
+        priv = this->private;
+
+        /* check if directory is valid, ie, its a top level dir, and
+         * not includes a '*' in it.
+         */
+        if (!strcmp ("*", directory)) {
+                default_flag = 1;
+        } else {
+                if (directory[0] != '/') {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "map takes absolute path, starting with '/'. "
+                                "not '%s'", directory);
+                        goto out;
+                }
+                /* len >= 1 here because directory[0] == '/' */
+                len = strlen (directory);
+                for (idx = 1; idx < (len - 1); idx++) {
+                        if (directory[idx] == '/') {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "map takes only top level directory, "
+                                        "not '%s'", directory);
+                                goto out;
+                        }
+                }
+        }
+
+        /* Assign proper subvolume */
+        for (trav = this->children; trav; trav = trav->next) {
+                if (strcmp (trav->xlator->name, subvol))
+                        continue;
+
+                /* Check if there is another directory for
+                 * same volume, if yes, return error.
+                 */
+                ret = check_multiple_volume_entry (this, trav->xlator);
+                if (ret != 0) {
+                        goto out;
+                }
+
+                if (default_flag) {
+                        if (priv->default_xl) {
+                                ret = -1;
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "'*' specified more than "
+                                        "once. don't confuse me!!!");
+                                goto out;
+                        }
+
+                        priv->default_xl = trav->xlator;
+                        goto out;
+                }
+
+                tmp_map = CALLOC (1, sizeof (struct map_pattern));
+                if (tmp_map == NULL) {
+                        ret = -1;
+                        gf_log (this->name, GF_LOG_ERROR, "out of memory");
+                        goto out;
+                }
+
+                tmp_map->directory = strdup (directory);
+                if (tmp_map->directory == NULL) {
+                        ret = -1;
+                        gf_log (this->name, GF_LOG_ERROR, "out of memory");
+                        FREE (tmp_map);
+                        goto out;
+                }
+
+                tmp_map->xl = trav->xlator;
+                tmp_map->dir_len = strlen (directory);
+
+                /* make sure that the top level directory starts
+                 * with '/' and ends without '/'
+                 */
+                if (directory[tmp_map->dir_len - 1] == '/') {
+                        tmp_map->dir_len--;
+                }
+
+                /* append to the end of the pattern list */
+                if (!priv->map)
+                        priv->map = tmp_map;
+                else {
+                        trav_map = priv->map;
+                        while (trav_map->next)
+                                trav_map = trav_map->next;
+                        trav_map->next = tmp_map;
+                }
+
+                goto out;
+        }
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "map volume '%s' is not proper subvolume", subvol);
+
+ out:
+        return ret;
+}
+
+/*
+ * Make the child named 'default_xl' the default subvolume.
+ *
+ * Returns 0 on success.  Returns -1 when the child is already mapped (see
+ * check_multiple_volume_entry) or when no child has that name.  A default
+ * that was set earlier is overridden with a warning.
+ */
+int
+assign_default_subvol (xlator_t *this, const char *default_xl)
+{
+        map_private_t *priv = this->private;
+        xlator_list_t *trav = NULL;
+
+        for (trav = this->children; trav; trav = trav->next) {
+                if (strcmp (trav->xlator->name, default_xl) != 0)
+                        continue;
+
+                if (check_multiple_volume_entry (this, trav->xlator) != 0)
+                        return -1;
+
+                if (priv->default_xl) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "default-volume option provided, "
+                                "overriding earlier '*' option");
+                }
+
+                priv->default_xl = trav->xlator;
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "default-volume value is not an valid subvolume. check again");
+
+        return -1;
+}
+
+/*
+ * Final pass over the subvolume array.  The first unmapped subvolume
+ * becomes the default when none is set; every other unmapped subvolume is
+ * reported with a warning.  A warning is also logged when there is still no
+ * default subvolume afterwards.
+ */
+void
+verify_if_all_subvolumes_got_used (xlator_t *this)
+{
+        map_private_t *priv = this->private;
+        int            idx  = 0;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (priv->xlarray[idx].mapped)
+                        continue;
+
+                if (priv->default_xl) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "subvolume '%s' is not mapped to "
+                                "any directory",
+                                priv->xlarray[idx].xl->name);
+                        continue;
+                }
+
+                priv->default_xl = priv->xlarray[idx].xl;
+                priv->xlarray[idx].mapped = 1;
+        }
+
+        if (!priv->default_xl) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "default subvolume not specified, filesystem "
+                        "may not work properly. Check 'map' translator "
+                        "documentation for more info");
+        }
+
+        return ;
+}
diff --git a/xlators/cluster/map/src/map.c b/xlators/cluster/map/src/map.c
new file mode 100644
index 000000000..8c4b7c83c
--- /dev/null
+++ b/xlators/cluster/map/src/map.c
@@ -0,0 +1,2193 @@
+/*
+ Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "map.h"
+
+/* For <op>_cbk functions */
+#include "defaults.c"
+
+
+/*
+ * map_stat - wind stat to the subvolume recorded in the inode context.
+ *
+ * A failed argument check jumps to 'err' with op_errno still at its initial
+ * value 1; a missing subvolume is reported as EINVAL.
+ */
+int32_t
+map_stat (call_frame_t *frame,
+          xlator_t *this,
+          loc_t *loc)
+{
+        xlator_t *subvol   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (subvol) {
+                STACK_WIND (frame,
+                            default_stat_cbk,
+                            subvol,
+                            subvol->fops->stat,
+                            loc);
+                return 0;
+        }
+
+        op_errno = EINVAL;
+ err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+
+/*
+ * map_chmod - wind chmod to the subvolume recorded in the inode context.
+ *
+ * A failed argument check jumps to 'err' with op_errno still at its initial
+ * value 1; a missing subvolume is reported as EINVAL.
+ */
+int32_t
+map_chmod (call_frame_t *frame,
+           xlator_t *this,
+           loc_t *loc,
+           mode_t mode)
+{
+        xlator_t *subvol   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (subvol) {
+                STACK_WIND (frame,
+                            default_chmod_cbk,
+                            subvol,
+                            subvol->fops->chmod,
+                            loc,
+                            mode);
+                return 0;
+        }
+
+        op_errno = EINVAL;
+ err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * map_fchmod - wind fchmod to the subvolume recorded in the context of the
+ * fd's inode.
+ *
+ * A failed argument check jumps to 'err' with op_errno still at its initial
+ * value 1; a missing subvolume is reported as EINVAL.
+ */
+int32_t
+map_fchmod (call_frame_t *frame,
+            xlator_t *this,
+            fd_t *fd,
+            mode_t mode)
+{
+        xlator_t *subvol   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        subvol = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (subvol) {
+                STACK_WIND (frame,
+                            default_fchmod_cbk,
+                            subvol,
+                            subvol->fops->fchmod,
+                            fd,
+                            mode);
+                return 0;
+        }
+
+        op_errno = EINVAL;
+ err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * map_chown - wind chown to the subvolume recorded in the inode context.
+ *
+ * A failed argument check jumps to 'err' with op_errno still at its initial
+ * value 1; a missing subvolume is reported as EINVAL.
+ */
+int32_t
+map_chown (call_frame_t *frame,
+           xlator_t *this,
+           loc_t *loc,
+           uid_t uid,
+           gid_t gid)
+{
+        xlator_t *subvol   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (subvol) {
+                STACK_WIND (frame,
+                            default_chown_cbk,
+                            subvol,
+                            subvol->fops->chown,
+                            loc,
+                            uid,
+                            gid);
+                return 0;
+        }
+
+        op_errno = EINVAL;
+ err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * map_fchown - wind fchown to the subvolume recorded in the context of the
+ * fd's inode.
+ *
+ * A failed argument check jumps to 'err' with op_errno still at its initial
+ * value 1; a missing subvolume is reported as EINVAL.
+ */
+int32_t
+map_fchown (call_frame_t *frame,
+            xlator_t *this,
+            fd_t *fd,
+            uid_t uid,
+            gid_t gid)
+{
+        xlator_t *subvol   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        subvol = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (subvol) {
+                STACK_WIND (frame,
+                            default_fchown_cbk,
+                            subvol,
+                            subvol->fops->fchown,
+                            fd,
+                            uid,
+                            gid);
+                return 0;
+        }
+
+        op_errno = EINVAL;
+ err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * map_truncate - wind truncate to the subvolume recorded in the inode
+ * context.
+ *
+ * A failed argument check jumps to 'err' with op_errno still at its initial
+ * value 1; a missing subvolume is reported as EINVAL.
+ */
+int32_t
+map_truncate (call_frame_t *frame,
+              xlator_t *this,
+              loc_t *loc,
+              off_t offset)
+{
+        xlator_t *subvol   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (subvol) {
+                STACK_WIND (frame,
+                            default_truncate_cbk,
+                            subvol,
+                            subvol->fops->truncate,
+                            loc,
+                            offset);
+                return 0;
+        }
+
+        op_errno = EINVAL;
+ err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * map_ftruncate - wind ftruncate to the subvolume recorded in the context
+ * of the fd's inode.
+ *
+ * A failed argument check jumps to 'err' with op_errno still at its initial
+ * value 1; a missing subvolume is reported as EINVAL.
+ */
+int32_t
+map_ftruncate (call_frame_t *frame,
+               xlator_t *this,
+               fd_t *fd,
+               off_t offset)
+{
+        xlator_t *subvol   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        subvol = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (subvol) {
+                STACK_WIND (frame,
+                            default_ftruncate_cbk,
+                            subvol,
+                            subvol->fops->ftruncate,
+                            fd,
+                            offset);
+                return 0;
+        }
+
+        op_errno = EINVAL;
+ err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * map_utimens - wind utimens to the subvolume recorded in the inode
+ * context.
+ *
+ * A failed argument check jumps to 'err' with op_errno still at its initial
+ * value 1; a missing subvolume is reported as EINVAL.
+ */
+int32_t
+map_utimens (call_frame_t *frame,
+             xlator_t *this,
+             loc_t *loc,
+             struct timespec tv[2])
+{
+        xlator_t *subvol   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (subvol) {
+                STACK_WIND (frame,
+                            default_utimens_cbk,
+                            subvol,
+                            subvol->fops->utimens,
+                            loc,
+                            tv);
+                return 0;
+        }
+
+        op_errno = EINVAL;
+ err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * map_access - wind access to the subvolume recorded in the inode context.
+ *
+ * A failed argument check jumps to 'err' with op_errno still at its initial
+ * value 1; a missing subvolume is reported as EINVAL.
+ */
+int32_t
+map_access (call_frame_t *frame,
+            xlator_t *this,
+            loc_t *loc,
+            int32_t mask)
+{
+        xlator_t *subvol   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (subvol) {
+                STACK_WIND (frame,
+                            default_access_cbk,
+                            subvol,
+                            subvol->fops->access,
+                            loc,
+                            mask);
+                return 0;
+        }
+
+        op_errno = EINVAL;
+ err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+
+/*
+ * map_readlink - wind readlink to the subvolume recorded in the inode
+ * context.
+ *
+ * A failed argument check jumps to 'err' with op_errno still at its initial
+ * value 1; a missing subvolume is reported as EINVAL.
+ */
+int32_t
+map_readlink (call_frame_t *frame,
+              xlator_t *this,
+              loc_t *loc,
+              size_t size)
+{
+        xlator_t *subvol   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (subvol) {
+                STACK_WIND (frame,
+                            default_readlink_cbk,
+                            subvol,
+                            subvol->fops->readlink,
+                            loc,
+                            size);
+                return 0;
+        }
+
+        op_errno = EINVAL;
+ err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * map_unlink - wind unlink to the subvolume recorded in the inode context.
+ *
+ * A failed argument check jumps to 'err' with op_errno still at its initial
+ * value 1; a missing subvolume is reported as EINVAL.
+ */
+int32_t
+map_unlink (call_frame_t *frame,
+            xlator_t *this,
+            loc_t *loc)
+{
+        xlator_t *subvol   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (subvol) {
+                STACK_WIND (frame,
+                            default_unlink_cbk,
+                            subvol,
+                            subvol->fops->unlink,
+                            loc);
+                return 0;
+        }
+
+        op_errno = EINVAL;
+ err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * map_rmdir - wind rmdir to the subvolume recorded in the inode context.
+ *
+ * A failed argument check jumps to 'err' with op_errno still at its initial
+ * value 1; a missing subvolume is reported as EINVAL.
+ */
+int32_t
+map_rmdir (call_frame_t *frame,
+           xlator_t *this,
+           loc_t *loc)
+{
+        xlator_t *subvol   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (subvol) {
+                STACK_WIND (frame,
+                            default_rmdir_cbk,
+                            subvol,
+                            subvol->fops->rmdir,
+                            loc);
+                return 0;
+        }
+
+        op_errno = EINVAL;
+ err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+
+/*
+ * map_rename - wind rename to the subvolume that holds the source inode.
+ *
+ * When newloc->path is set and maps (by path) to a different subvolume, the
+ * call is unwound with EXDEV.  A failed argument check jumps to 'err' with
+ * op_errno still at its initial value 1; a source inode without a recorded
+ * subvolume is reported as EINVAL.
+ */
+int32_t
+map_rename (call_frame_t *frame,
+            xlator_t *this,
+            loc_t *oldloc,
+            loc_t *newloc)
+{
+        xlator_t *old_subvol = NULL;
+        xlator_t *new_subvol = NULL;
+        int32_t   op_errno   = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (oldloc, err);
+        VALIDATE_OR_GOTO (oldloc->inode, err);
+        VALIDATE_OR_GOTO (oldloc->path, err);
+        VALIDATE_OR_GOTO (newloc, err);
+
+        old_subvol = get_mapping_subvol_from_ctx (this, oldloc->inode);
+        if (!old_subvol) {
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        /* the destination is resolved by path only when one is given */
+        new_subvol = newloc->path
+                ? get_mapping_subvol_from_path (this, newloc->path)
+                : NULL;
+        if (new_subvol && (new_subvol != old_subvol)) {
+                op_errno = EXDEV;
+                goto err;
+        }
+
+        STACK_WIND (frame,
+                    default_rename_cbk,
+                    old_subvol,
+                    old_subvol->fops->rename,
+                    oldloc, newloc);
+        return 0;
+ err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+
+/*
+ * map_link - wind link to the subvolume that holds the source inode.
+ *
+ * When newloc->path is set and maps (by path) to a different subvolume, the
+ * call is unwound with EXDEV.  A failed argument check jumps to 'err' with
+ * op_errno still at its initial value 1; a source inode without a recorded
+ * subvolume is reported as EINVAL.
+ */
+int32_t
+map_link (call_frame_t *frame,
+          xlator_t *this,
+          loc_t *oldloc,
+          loc_t *newloc)
+{
+        xlator_t *old_subvol = NULL;
+        xlator_t *new_subvol = NULL;
+        int32_t   op_errno   = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (oldloc, err);
+        VALIDATE_OR_GOTO (oldloc->inode, err);
+        VALIDATE_OR_GOTO (oldloc->path, err);
+        VALIDATE_OR_GOTO (newloc, err);
+
+        old_subvol = get_mapping_subvol_from_ctx (this, oldloc->inode);
+        if (!old_subvol) {
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        /* the destination is resolved by path only when one is given */
+        new_subvol = newloc->path
+                ? get_mapping_subvol_from_path (this, newloc->path)
+                : NULL;
+        if (new_subvol && (new_subvol != old_subvol)) {
+                op_errno = EXDEV;
+                goto err;
+        }
+
+        STACK_WIND (frame,
+                    default_link_cbk,
+                    old_subvol,
+                    old_subvol->fops->link,
+                    oldloc, newloc);
+        return 0;
+ err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+
+/*
+ * map_open - wind open to the subvolume recorded in the inode context.
+ *
+ * A failed argument check jumps to 'err' with op_errno still at its initial
+ * value 1; a missing subvolume is reported as EINVAL.
+ */
+int32_t
+map_open (call_frame_t *frame,
+          xlator_t *this,
+          loc_t *loc,
+          int32_t flags, fd_t *fd)
+{
+        xlator_t *subvol   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (subvol) {
+                STACK_WIND (frame,
+                            default_open_cbk,
+                            subvol,
+                            subvol->fops->open,
+                            loc, flags, fd);
+                return 0;
+        }
+
+        op_errno = EINVAL;
+ err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * map_readv - wind readv to the subvolume recorded in the context of the
+ * fd's inode.
+ *
+ * A failed argument check jumps to 'err' with op_errno still at its initial
+ * value 1; a missing subvolume is reported as EINVAL.
+ */
+int32_t
+map_readv (call_frame_t *frame,
+           xlator_t *this,
+           fd_t *fd,
+           size_t size,
+           off_t offset)
+{
+        xlator_t *subvol   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        subvol = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (subvol) {
+                STACK_WIND (frame,
+                            default_readv_cbk,
+                            subvol,
+                            subvol->fops->readv,
+                            fd,
+                            size,
+                            offset);
+                return 0;
+        }
+
+        op_errno = EINVAL;
+ err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * map_writev - forward a vectored write on an open fd to the subvolume
+ * that get_mapping_subvol_from_ctx() returns for fd->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_writev (call_frame_t *frame,
+            xlator_t *this,
+            fd_t *fd,
+            struct iovec *vector,
+            int32_t count,
+            off_t off)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        target = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_writev_cbk,
+                            target, target->fops->writev,
+                            fd, vector, count, off);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+/*
+ * map_flush - forward a flush on an open fd to the subvolume that
+ * get_mapping_subvol_from_ctx() returns for fd->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_flush (call_frame_t *frame,
+           xlator_t *this,
+           fd_t *fd)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        target = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_flush_cbk,
+                            target, target->fops->flush,
+                            fd);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+
+/*
+ * map_fsync - forward an fsync on an open fd to the subvolume that
+ * get_mapping_subvol_from_ctx() returns for fd->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_fsync (call_frame_t *frame,
+           xlator_t *this,
+           fd_t *fd,
+           int32_t flags)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        target = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_fsync_cbk,
+                            target, target->fops->fsync,
+                            fd, flags);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+/*
+ * map_fstat - forward an fstat on an open fd to the subvolume that
+ * get_mapping_subvol_from_ctx() returns for fd->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_fstat (call_frame_t *frame,
+           xlator_t *this,
+           fd_t *fd)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        target = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_fstat_cbk,
+                            target, target->fops->fstat,
+                            fd);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+/*
+ * map_getdents - forward a getdents on an open directory fd to the
+ * subvolume that get_mapping_subvol_from_ctx() returns for fd->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_getdents (call_frame_t *frame,
+              xlator_t *this,
+              fd_t *fd,
+              size_t size,
+              off_t offset,
+              int32_t flag)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        target = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_getdents_cbk,
+                            target, target->fops->getdents,
+                            fd, size, offset, flag);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+/*
+ * map_setdents - forward a setdents on an open directory fd to the
+ * subvolume that get_mapping_subvol_from_ctx() returns for fd->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_setdents (call_frame_t *frame,
+              xlator_t *this,
+              fd_t *fd,
+              int32_t flags,
+              dir_entry_t *entries,
+              int32_t count)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        target = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_setdents_cbk,
+                            target, target->fops->setdents,
+                            fd, flags, entries, count);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+
+/*
+ * map_fsyncdir - forward an fsyncdir on an open directory fd to the
+ * subvolume that get_mapping_subvol_from_ctx() returns for fd->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_fsyncdir (call_frame_t *frame,
+              xlator_t *this,
+              fd_t *fd,
+              int32_t flags)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        target = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_fsyncdir_cbk,
+                            target, target->fops->fsyncdir,
+                            fd, flags);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+
+
+
+/*
+ * map_setxattr - forward a setxattr to the subvolume that
+ * get_mapping_subvol_from_ctx() returns for loc->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_setxattr (call_frame_t *frame,
+              xlator_t *this,
+              loc_t *loc,
+              dict_t *dict,
+              int32_t flags)
+{
+        /* TODO: support for 'get' 'put' API */
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        target = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_setxattr_cbk,
+                            target, target->fops->setxattr,
+                            loc, dict, flags);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+/*
+ * map_getxattr - forward a getxattr to the subvolume that
+ * get_mapping_subvol_from_ctx() returns for loc->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_getxattr (call_frame_t *frame,
+              xlator_t *this,
+              loc_t *loc,
+              const char *name)
+{
+        /* TODO: support for 'get' 'put' API */
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        target = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_getxattr_cbk,
+                            target, target->fops->getxattr,
+                            loc, name);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+/*
+ * map_xattrop - forward an xattrop to the subvolume that
+ * get_mapping_subvol_from_ctx() returns for loc->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_xattrop (call_frame_t *frame,
+             xlator_t *this,
+             loc_t *loc,
+             gf_xattrop_flags_t flags,
+             dict_t *dict)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        target = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_xattrop_cbk,
+                            target, target->fops->xattrop,
+                            loc, flags, dict);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+/*
+ * map_fxattrop - forward an fxattrop on an open fd to the subvolume that
+ * get_mapping_subvol_from_ctx() returns for fd->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_fxattrop (call_frame_t *frame,
+              xlator_t *this,
+              fd_t *fd,
+              gf_xattrop_flags_t flags,
+              dict_t *dict)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        target = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_fxattrop_cbk,
+                            target, target->fops->fxattrop,
+                            fd, flags, dict);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+/*
+ * map_removexattr - forward a removexattr to the subvolume that
+ * get_mapping_subvol_from_ctx() returns for loc->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_removexattr (call_frame_t *frame,
+                 xlator_t *this,
+                 loc_t *loc,
+                 const char *name)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        target = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_removexattr_cbk,
+                            target, target->fops->removexattr,
+                            loc, name);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+/*
+ * map_lk - forward a lock request on an open fd to the subvolume that
+ * get_mapping_subvol_from_ctx() returns for fd->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_lk (call_frame_t *frame,
+        xlator_t *this,
+        fd_t *fd,
+        int32_t cmd,
+        struct flock *lock)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        target = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_lk_cbk,
+                            target, target->fops->lk,
+                            fd, cmd, lock);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+
+/*
+ * map_inodelk - forward an inodelk to the subvolume that
+ * get_mapping_subvol_from_ctx() returns for loc->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_inodelk (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, int32_t cmd, struct flock *lock)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        target = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_inodelk_cbk,
+                            target, target->fops->inodelk,
+                            loc, cmd, lock);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+
+/*
+ * map_finodelk - forward an finodelk on an open fd to the subvolume that
+ * get_mapping_subvol_from_ctx() returns for fd->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_finodelk (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, int32_t cmd, struct flock *lock)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        target = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_finodelk_cbk,
+                            target, target->fops->finodelk,
+                            fd, cmd, lock);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+/*
+ * map_entrylk - forward an entrylk to the subvolume that
+ * get_mapping_subvol_from_ctx() returns for loc->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_entrylk (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, const char *basename,
+             entrylk_cmd cmd, entrylk_type type)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        target = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_entrylk_cbk,
+                            target, target->fops->entrylk,
+                            loc, basename, cmd, type);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+/*
+ * map_fentrylk - forward an fentrylk on an open fd to the subvolume that
+ * get_mapping_subvol_from_ctx() returns for fd->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_fentrylk (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, const char *basename,
+              entrylk_cmd cmd, entrylk_type type)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        target = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_fentrylk_cbk,
+                            target, target->fops->fentrylk,
+                            fd, basename, cmd, type);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+/*
+ * map_checksum - forward a checksum request to the subvolume that
+ * get_mapping_subvol_from_ctx() returns for loc->inode.
+ *
+ * Always returns 0.  If an argument check jumps to err (op_errno 1) or
+ * no subvolume is found (EINVAL) the frame is unwound with op_ret -1.
+ */
+int32_t
+map_checksum (call_frame_t *frame,
+              xlator_t *this,
+              loc_t *loc,
+              int32_t flag)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        target = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (target != NULL) {
+                STACK_WIND (frame, default_checksum_cbk,
+                            target, target->fops->checksum,
+                            loc, flag);
+                return 0;
+        }
+        op_errno = EINVAL;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+/*
+ * map_newentry_cbk - shared reply handler for mknod, mkdir and symlink.
+ *
+ * Rewrites buf->st_ino through map_itransform() for the child that
+ * answered (prev->this) and passes the reply up unchanged otherwise.
+ *
+ * The inode number is only touched on a successful reply that carries a
+ * stat buffer: the error paths in this file unwind with a NULL buffer,
+ * so an unconditional buf->st_ino would dereference NULL on failure.
+ */
+static int32_t
+map_newentry_cbk (call_frame_t *frame,
+                  void *cookie,
+                  xlator_t *this,
+                  int32_t op_ret,
+                  int32_t op_errno,
+                  inode_t *inode,
+                  struct stat *buf)
+{
+        call_frame_t *prev = cookie;
+
+        if ((op_ret != -1) && (buf != NULL))
+                map_itransform (this, prev->this, buf->st_ino, &buf->st_ino);
+
+        STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
+        return 0;
+}
+
+
+/*
+ * map_mknod - create a node on the subvolume selected from loc->path.
+ *
+ * The subvolume comes from get_mapping_subvol_from_path(); its pointer
+ * is stored in the inode context with inode_ctx_put() (a failure there
+ * is only logged) before the call is wound with map_newentry_cbk.
+ * Always returns 0; EINVAL is unwound when no subvolume matches.
+ */
+int32_t
+map_mknod (call_frame_t *frame,
+           xlator_t *this,
+           loc_t *loc,
+           mode_t mode,
+           dev_t rdev)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+
+        target = get_mapping_subvol_from_path (this, loc->path);
+        if (target == NULL) {
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        /* Best effort: the operation proceeds even if this fails. */
+        if (inode_ctx_put (loc->inode, this, (uint64_t)(long)target) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "%s: failed to set subvolume ptr in inode ctx",
+                        loc->path);
+        }
+
+        STACK_WIND (frame, map_newentry_cbk,
+                    target, target->fops->mknod,
+                    loc, mode, rdev);
+        return 0;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+/*
+ * map_mkdir - create a directory on the subvolume selected from loc->path.
+ *
+ * The subvolume comes from get_mapping_subvol_from_path(); its pointer
+ * is stored in the inode context with inode_ctx_put() (a failure there
+ * is only logged) before the call is wound with map_newentry_cbk.
+ * Always returns 0; EINVAL is unwound when no subvolume matches.
+ */
+int32_t
+map_mkdir (call_frame_t *frame,
+           xlator_t *this,
+           loc_t *loc,
+           mode_t mode)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+
+        target = get_mapping_subvol_from_path (this, loc->path);
+        if (target == NULL) {
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        /* Best effort: the operation proceeds even if this fails. */
+        if (inode_ctx_put (loc->inode, this, (uint64_t)(long)target) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "%s: failed to set subvolume ptr in inode ctx",
+                        loc->path);
+        }
+
+        STACK_WIND (frame, map_newentry_cbk,
+                    target, target->fops->mkdir,
+                    loc, mode);
+        return 0;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+
+/*
+ * map_symlink - create a symlink on the subvolume selected from loc->path.
+ *
+ * The subvolume comes from get_mapping_subvol_from_path(); its pointer
+ * is stored in the inode context with inode_ctx_put() (a failure there
+ * is only logged) before the call is wound with map_newentry_cbk.
+ * Always returns 0; EINVAL is unwound when no subvolume matches.
+ */
+int32_t
+map_symlink (call_frame_t *frame,
+             xlator_t *this,
+             const char *linkpath,
+             loc_t *loc)
+{
+        xlator_t *target   = NULL;
+        int32_t   op_errno = 1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+
+        target = get_mapping_subvol_from_path (this, loc->path);
+        if (target == NULL) {
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        /* Best effort: the operation proceeds even if this fails. */
+        if (inode_ctx_put (loc->inode, this, (uint64_t)(long)target) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "%s: failed to set subvolume ptr in inode ctx",
+                        loc->path);
+        }
+
+        STACK_WIND (frame, map_newentry_cbk,
+                    target, target->fops->symlink,
+                    linkpath, loc);
+        return 0;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        return 0;
+}
+
+
+/*
+ * map_create_cbk - reply handler for create.
+ *
+ * Rewrites buf->st_ino through map_itransform() for the child that
+ * answered (prev->this) and passes fd, inode and buf up the stack.
+ *
+ * The inode number is only touched on a successful reply that carries a
+ * stat buffer; failed replies may arrive without one, and an
+ * unconditional buf->st_ino would then dereference NULL.
+ */
+static int32_t
+map_create_cbk (call_frame_t *frame,
+                void *cookie,
+                xlator_t *this,
+                int32_t op_ret,
+                int32_t op_errno,
+                fd_t *fd,
+                inode_t *inode,
+                struct stat *buf)
+{
+        call_frame_t *prev = cookie;
+
+        if ((op_ret != -1) && (buf != NULL))
+                map_itransform (this, prev->this, buf->st_ino, &buf->st_ino);
+
+        STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
+        return 0;
+}
+
+/*
+ * map_create - create and open a file on the subvolume selected from
+ * loc->path.
+ *
+ * The subvolume comes from get_mapping_subvol_from_path(); its pointer
+ * is stored in the inode context with inode_ctx_put() (a failure there
+ * is only logged) before the call is wound with map_create_cbk.
+ * Always returns 0; EINVAL is unwound when no subvolume matches.
+ */
+int32_t
+map_create (call_frame_t *frame,
+            xlator_t *this,
+            loc_t *loc,
+            int32_t flags,
+            mode_t mode, fd_t *fd)
+{
+        int32_t   op_errno = 1;
+        int       ret      = 0;
+        xlator_t *subvol   = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+
+        subvol = get_mapping_subvol_from_path (this, loc->path);
+        if (!subvol) {
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        /* Best effort: the create proceeds even if this fails. */
+        ret = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "%s: failed to set subvolume ptr in inode ctx",
+                        loc->path);
+        }
+
+        STACK_WIND (frame, map_create_cbk,
+                    subvol,
+                    subvol->fops->create,
+                    loc, flags, mode, fd);
+        return 0;
+
+err:
+        /*
+         * A create reply carries three results (fd, inode, stat buffer,
+         * as map_create_cbk shows), so all three are passed as NULL;
+         * passing only two leaves the buffer argument undefined.
+         */
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * map_single_lookup_cbk - reply handler for a lookup sent to exactly
+ * one subvolume.
+ *
+ * Rewrites buf->st_ino through map_itransform() for the child that
+ * answered (prev->this) and passes inode, buf and dict up the stack.
+ *
+ * The inode number is only touched on a successful reply that carries a
+ * stat buffer; failed replies may arrive without one, and an
+ * unconditional buf->st_ino would then dereference NULL.
+ */
+int32_t
+map_single_lookup_cbk (call_frame_t *frame,
+                       void *cookie,
+                       xlator_t *this,
+                       int32_t op_ret,
+                       int32_t op_errno,
+                       inode_t *inode,
+                       struct stat *buf,
+                       dict_t *dict)
+{
+        call_frame_t *prev = cookie;
+
+        if ((op_ret != -1) && (buf != NULL))
+                map_itransform (this, prev->this, buf->st_ino, &buf->st_ino);
+
+        STACK_UNWIND (frame, op_ret, op_errno, inode, buf, dict);
+
+        return 0;
+}
+
+/*
+ * map_lookup_cbk - gather the replies of a lookup fanned out to every
+ * child (used for the root inode).
+ *
+ * The first successful reply supplies the stat buffer, inode and dict
+ * kept in frame->local; every failure records its op_errno.  When the
+ * last reply arrives (call_count reaches 0) the collected result is
+ * unwound and the references taken here are dropped.
+ */
+int32_t
+map_lookup_cbk (call_frame_t *frame,
+                void *cookie,
+                xlator_t *this,
+                int32_t op_ret,
+                int32_t op_errno,
+                inode_t *inode,
+                struct stat *buf,
+                dict_t *dict)
+{
+        int          callcnt   = 0;
+        map_local_t *local     = NULL;
+        inode_t     *tmp_inode = NULL;
+        dict_t      *tmp_dict  = NULL;
+
+        local = frame->local;
+        LOCK (&frame->lock);
+        {
+                callcnt = --local->call_count;
+                if ((op_ret == 0) && (local->op_ret == -1)) {
+                        local->op_ret = 0;
+                        local->stbuf  = *buf;
+                        if (dict)
+                                local->dict = dict_ref (dict);
+                        local->inode = inode_ref (inode);
+                }
+                if (op_ret == -1)
+                        local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!callcnt) {
+                /*
+                 * Copy the pointers out before unwinding and use only
+                 * the copies afterwards, as the opendir/readdir
+                 * callbacks in this file do with local->fd.
+                 */
+                tmp_dict  = local->dict;
+                tmp_inode = local->inode;
+
+                STACK_UNWIND (frame, local->op_ret,
+                              local->op_errno, tmp_inode,
+                              &local->stbuf, tmp_dict);
+
+                /* Both stay NULL when every child lookup failed. */
+                if (tmp_inode)
+                        inode_unref (tmp_inode);
+                if (tmp_dict)
+                        dict_unref (tmp_dict);
+        }
+
+        return 0;
+}
+
+/*
+ * map_lookup - look an entry up.
+ *
+ * For the root inode (ino 1) the lookup is sent to every child and the
+ * replies are merged by map_lookup_cbk.  For anything else the
+ * subvolume is taken from the inode context, falling back to a
+ * path-based match whose result is then stored in the inode context
+ * (a failure to store is only logged).
+ *
+ * Always returns 0; failures unwind with op_ret -1 and EINVAL, or
+ * ENOMEM when the per-call state cannot be allocated.
+ */
+int32_t
+map_lookup (call_frame_t *frame,
+            xlator_t *this,
+            loc_t *loc,
+            dict_t *xattr_req)
+{
+        int32_t        op_errno = EINVAL;
+        int            ret      = 0;
+        xlator_t      *subvol   = NULL;
+        map_local_t   *local    = NULL;
+        map_private_t *priv     = NULL;
+        xlator_list_t *trav     = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+
+        priv = this->private;
+
+        if (loc->inode->ino == 1)
+                goto root_inode;
+
+        subvol = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (!subvol) {
+                subvol = get_mapping_subvol_from_path (this, loc->path);
+                if (!subvol) {
+                        goto err;
+                }
+
+                ret = inode_ctx_put (loc->inode, this,
+                                     (uint64_t)(long)subvol);
+                if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "%s: failed to set subvolume in inode ctx",
+                                loc->path);
+                }
+        }
+
+        /* Just one callback */
+        STACK_WIND (frame,
+                    map_single_lookup_cbk,
+                    subvol,
+                    subvol->fops->lookup,
+                    loc,
+                    xattr_req);
+
+        return 0;
+
+root_inode:
+        local = CALLOC (1, sizeof (map_local_t));
+        if (!local) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        frame->local      = local;
+        local->call_count = priv->child_count;
+        local->op_ret     = -1;
+
+        trav = this->children;
+        while (trav) {
+                STACK_WIND (frame,
+                            map_lookup_cbk,
+                            trav->xlator,
+                            trav->xlator->fops->lookup,
+                            loc,
+                            xattr_req);
+                trav = trav->next;
+        }
+
+        return 0;
+
+err:
+        /* A lookup reply carries three results: inode, stat buffer, dict. */
+        STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);
+
+        return 0;
+}
+/*
+ * map_normalize_stats - re-express the block counters of 'buf' in new
+ * block units.
+ *
+ * f_bfree and f_bavail are scaled from f_bsize to 'bsize', f_blocks is
+ * scaled from f_frsize to 'frsize', and the two size fields are updated
+ * to the new units.  Scaled counts are truncated towards zero.
+ *
+ * A target size of 0 is ignored: the matching fields are left alone
+ * rather than dividing by zero.
+ */
+void
+map_normalize_stats (struct statvfs *buf,
+                     unsigned long bsize,
+                     unsigned long frsize)
+{
+        double factor = 0.0;
+
+        if ((bsize != 0) && (buf->f_bsize != bsize)) {
+                factor        = ((double) buf->f_bsize) / bsize;
+                buf->f_bsize  = bsize;
+                buf->f_bfree  = (fsblkcnt_t) (factor * buf->f_bfree);
+                buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail);
+        }
+
+        if ((frsize != 0) && (buf->f_frsize != frsize)) {
+                factor        = ((double) buf->f_frsize) / frsize;
+                buf->f_frsize = frsize;
+                buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks);
+        }
+}
+
+
+/*
+ * map_statfs_cbk - gather the replies of a statfs fanned out to every
+ * child and unwind their sum once the last reply is in.
+ *
+ * Failed replies only record op_errno.  Each successful reply is added
+ * to local->statvfs after both sides have been brought to a common
+ * block size with map_normalize_stats().
+ */
+int32_t
+map_statfs_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct statvfs *stbuf)
+{
+ struct statvfs *dict_buf = NULL;
+ map_local_t *local = NULL;
+ int this_call_cnt = 0;
+ unsigned long bsize;
+ unsigned long frsize;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ this_call_cnt = --local->call_count;
+
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ goto unlock;
+ }
+ /* one successful child is enough for an overall success */
+ local->op_ret = 0;
+
+ /* when a call is successful, fold its numbers into local->statvfs */
+ dict_buf = &local->statvfs;
+
+ if (dict_buf->f_bsize != 0) {
+ /* not the first success: rescale both sides to the larger sizes */
+ bsize = max (dict_buf->f_bsize,
+ stbuf->f_bsize);
+
+ frsize = max (dict_buf->f_frsize,
+ stbuf->f_frsize);
+ map_normalize_stats(dict_buf, bsize, frsize);
+ map_normalize_stats(stbuf, bsize, frsize);
+ } else {
+ /* first success: adopt this reply's block sizes */
+ dict_buf->f_bsize = stbuf->f_bsize;
+ dict_buf->f_frsize = stbuf->f_frsize;
+ }
+
+ /* counters are summed; fsid, flag and namemax come from the latest reply */
+ dict_buf->f_blocks += stbuf->f_blocks;
+ dict_buf->f_bfree += stbuf->f_bfree;
+ dict_buf->f_bavail += stbuf->f_bavail;
+ dict_buf->f_files += stbuf->f_files;
+ dict_buf->f_ffree += stbuf->f_ffree;
+ dict_buf->f_favail += stbuf->f_favail;
+ dict_buf->f_fsid = stbuf->f_fsid;
+ dict_buf->f_flag = stbuf->f_flag;
+ dict_buf->f_namemax = stbuf->f_namemax;
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ /* the reply that brings the count to zero unwinds the merged result */
+ if (!this_call_cnt) {
+ STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ &local->statvfs);
+ }
+
+ return 0;
+}
+
+/*
+ * map_statfs - report filesystem statistics.
+ *
+ * For the root inode (ino 1) the request is sent to every child and
+ * the replies are summed by map_statfs_cbk.  For any other inode it is
+ * forwarded to the subvolume found in the inode context.
+ *
+ * Always returns 0; failures unwind with op_ret -1 and EINVAL, or
+ * ENOMEM when the per-call state cannot be allocated.
+ */
+int32_t
+map_statfs (call_frame_t *frame,
+            xlator_t *this,
+            loc_t *loc)
+{
+        int32_t        op_errno = EINVAL;
+        xlator_t      *subvol   = NULL;
+        map_local_t   *local    = NULL;
+        map_private_t *priv     = NULL;
+        xlator_list_t *trav     = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+
+        if (loc->inode->ino == 1)
+                goto root_inode;
+
+        subvol = get_mapping_subvol_from_ctx (this, loc->inode);
+        if (!subvol) {
+                goto err;
+        }
+
+        /* Just one callback */
+        STACK_WIND (frame,
+                    default_statfs_cbk,
+                    subvol,
+                    subvol->fops->statfs,
+                    loc);
+
+        return 0;
+
+root_inode:
+        local = CALLOC (1, sizeof (map_local_t));
+        if (!local) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        priv              = this->private;
+        frame->local      = local;
+        local->call_count = priv->child_count;
+        local->op_ret     = -1;
+
+        trav = this->children;
+        while (trav) {
+                STACK_WIND (frame,
+                            map_statfs_cbk,
+                            trav->xlator,
+                            trav->xlator->fops->statfs,
+                            loc);
+                trav = trav->next;
+        }
+
+        return 0;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+/*
+ * map_opendir_cbk - gather the replies of an opendir fanned out to
+ * every child.
+ *
+ * A failed reply records its op_errno, a successful one marks the
+ * overall result as success.  The reply that brings call_count to zero
+ * unwinds with the fd kept in frame->local and drops that reference.
+ */
+int32_t
+map_opendir_cbk (call_frame_t *frame,
+                 void *cookie,
+                 xlator_t *this,
+                 int32_t op_ret,
+                 int32_t op_errno,
+                 fd_t *fd)
+{
+        map_local_t *state   = frame->local;
+        fd_t        *opened  = NULL;
+        int          pending = 0;
+
+        LOCK (&frame->lock);
+        {
+                pending = --state->call_count;
+
+                if (op_ret == -1)
+                        state->op_errno = op_errno;
+                else
+                        state->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        /* Other children still have to answer. */
+        if (pending != 0)
+                return 0;
+
+        /* Detach the fd first so it can be released after the unwind. */
+        opened    = state->fd;
+        state->fd = NULL;
+
+        STACK_UNWIND (frame, state->op_ret, state->op_errno, opened);
+
+        fd_unref (opened);
+        return 0;
+}
+
+
+/*
+ * map_opendir - open a directory.
+ *
+ * For the root inode (ino 1) the opendir is sent to every child and the
+ * replies are merged by map_opendir_cbk, which also releases the fd
+ * reference taken here.  For any other directory it is forwarded to the
+ * subvolume found in the inode context of fd->inode.
+ *
+ * Always returns 0; failures unwind with op_ret -1 and EINVAL, or
+ * ENOMEM when the per-call state cannot be allocated.
+ */
+int32_t
+map_opendir (call_frame_t *frame,
+             xlator_t *this,
+             loc_t *loc, fd_t *fd)
+{
+        int32_t        op_errno = EINVAL;
+        xlator_t      *subvol   = NULL;
+        map_local_t   *local    = NULL;
+        map_private_t *priv     = NULL;
+        xlator_list_t *trav     = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+        /* loc->inode is read just below, so it has to be checked too. */
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+
+        if (loc->inode->ino == 1)
+                goto root_inode;
+
+        subvol = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (!subvol) {
+                goto err;
+        }
+
+        /* Just one callback */
+        STACK_WIND (frame,
+                    default_opendir_cbk,
+                    subvol,
+                    subvol->fops->opendir,
+                    loc, fd);
+        return 0;
+
+root_inode:
+        local = CALLOC (1, sizeof (map_local_t));
+        if (!local) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        priv              = this->private;
+        frame->local      = local;
+        local->call_count = priv->child_count;
+        local->op_ret     = -1;
+        local->fd         = fd_ref (fd);
+
+        trav = this->children;
+        while (trav) {
+                STACK_WIND (frame,
+                            map_opendir_cbk,
+                            trav->xlator,
+                            trav->xlator->fops->opendir,
+                            loc, fd);
+                trav = trav->next;
+        }
+
+        return 0;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+
+/*
+ * map_single_readdir_cbk - reply handler for a readdir sent to exactly
+ * one subvolume.
+ *
+ * Rewrites the d_ino of every returned entry through map_itransform()
+ * for the child that answered (prev->this), then passes the list up.
+ *
+ * The list is only walked on a successful reply that carries entries:
+ * readdir failures in this file are unwound with a NULL list, and
+ * walking it unconditionally would dereference NULL.
+ */
+int32_t
+map_single_readdir_cbk (call_frame_t *frame,
+                        void *cookie,
+                        xlator_t *this,
+                        int32_t op_ret,
+                        int32_t op_errno,
+                        gf_dirent_t *entries)
+{
+        call_frame_t *prev       = cookie;
+        gf_dirent_t  *orig_entry = NULL;
+
+        if ((op_ret >= 0) && (entries != NULL)) {
+                list_for_each_entry (orig_entry, &entries->list, list) {
+                        map_itransform (this, prev->this, orig_entry->d_ino,
+                                        &orig_entry->d_ino);
+                }
+        }
+
+        STACK_UNWIND (frame, op_ret, op_errno, entries);
+
+        return 0;
+}
+
+
+/*
+ * map_readdir_cbk - reply handler for readdir on the root directory,
+ * which is read from one child at a time.
+ *
+ * Each returned entry is copied into a new list with its d_ino and
+ * d_off rewritten through map_itransform() for the answering child.
+ * If the child produced no entries (or failed), the readdir is re-sent
+ * to the child returned by map_subvol_next(), starting at offset 0.
+ * When there is no further child, or at least one entry was copied,
+ * the result is unwound and the fd reference held in frame->local is
+ * dropped.
+ */
+int
+map_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *orig_entries)
+{
+ map_local_t *local = NULL;
+ gf_dirent_t entries;
+ gf_dirent_t *orig_entry = NULL;
+ gf_dirent_t *entry = NULL;
+ call_frame_t *prev = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *next = NULL;
+ int count = 0;
+ fd_t *local_fd = NULL;
+
+ INIT_LIST_HEAD (&entries.list);
+ prev = cookie;
+ local = frame->local;
+
+ /* a failed child is treated like an empty one: try the next child */
+ if (op_ret < 0)
+ goto done;
+
+ list_for_each_entry (orig_entry, &orig_entries->list, list) {
+ subvol = prev->this;
+
+ entry = gf_dirent_for_name (orig_entry->d_name);
+ if (!entry) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "memory allocation failed :(");
+ /* unwinds with the child's op_ret and the entries copied so far */
+ goto unwind;
+ }
+
+ map_itransform (this, subvol, orig_entry->d_ino,
+ &entry->d_ino);
+ map_itransform (this, subvol, orig_entry->d_off,
+ &entry->d_off);
+
+ entry->d_type = orig_entry->d_type;
+ entry->d_len = orig_entry->d_len;
+
+ list_add_tail (&entry->list, &entries.list);
+ count++;
+ }
+
+ op_ret = count;
+
+done:
+ if (count == 0) {
+ /* nothing from this child: continue with the next one, from its start */
+ next = map_subvol_next (this, prev->this);
+ if (!next) {
+ goto unwind;
+ }
+
+ STACK_WIND (frame, map_readdir_cbk,
+ next, next->fops->readdir,
+ local->fd, local->size, 0);
+ return 0;
+ }
+
+unwind:
+ /* an error from the last child is reported as "no entries" (op_ret 0) */
+ if (op_ret < 0)
+ op_ret = 0;
+
+ /* detach the fd before unwinding so it can be released afterwards */
+ local_fd = local->fd;
+ local->fd = NULL;
+
+ STACK_UNWIND (frame, op_ret, op_errno, &entries);
+
+ fd_unref (local_fd);
+
+ gf_dirent_free (&entries);
+
+ return 0;
+}
+
+
+/*
+ * map_readdir - read directory entries.
+ *
+ * For a directory other than the root the request is forwarded to the
+ * subvolume found in the inode context.  For the root (ino 1) the
+ * incoming offset is split by map_deitransform() into a child and an
+ * offset within that child; the read starts there and map_readdir_cbk
+ * moves on to further children as needed.
+ *
+ * Always returns 0; failures unwind with op_ret -1 and EINVAL, or
+ * ENOMEM when the per-call state cannot be allocated.
+ */
+int32_t
+map_readdir (call_frame_t *frame,
+             xlator_t *this,
+             fd_t *fd,
+             size_t size,
+             off_t yoff)
+{
+        int32_t      op_errno = EINVAL;
+        xlator_t    *subvol   = NULL;
+        map_local_t *local    = NULL;
+        xlator_t    *xvol     = NULL;
+        uint64_t     xoff     = 0;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        if (fd->inode->ino == 1)
+                goto root_inode;
+
+        subvol = get_mapping_subvol_from_ctx (this, fd->inode);
+        if (!subvol) {
+                goto err;
+        }
+
+        /* Just one callback */
+
+        STACK_WIND (frame,
+                    map_single_readdir_cbk,
+                    subvol,
+                    subvol->fops->readdir,
+                    fd, size, yoff);
+        return 0;
+
+root_inode:
+        /* readdir on '/' */
+
+        /*
+         * Resolve the child before taking any resources, so a failed
+         * resolution can leave through the plain error path.  xoff is a
+         * real uint64_t: the callee writes 64 bits through the pointer,
+         * so pointing it at an off_t is only safe when off_t is 64 bits.
+         */
+        map_deitransform (this, yoff, &xvol, &xoff);
+        if (!xvol) {
+                goto err;
+        }
+
+        local = CALLOC (1, sizeof (map_local_t));
+        if (!local) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        frame->local    = local;
+        local->op_errno = ENOENT;
+        local->op_ret   = -1;
+
+        local->fd   = fd_ref (fd);
+        local->size = size;
+
+        STACK_WIND (frame, map_readdir_cbk,
+                    xvol, xvol->fops->readdir,
+                    fd, size, xoff);
+
+        return 0;
+
+err:
+        STACK_UNWIND (frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+
+#if 0
+/* TODO : do it later as currently only unify uses this mop and mostly
+ unify will be used below map */
+/*
+ * Disabled sketch of the 'stats' management operation.  It would not
+ * compile if enabled as is: map_stats uses 'subvol' and 'op_errno'
+ * without declaring them and has an 'err' label that nothing jumps to.
+ */
+int32_t
+map_stats_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct xlator_stats *stats)
+{
+ STACK_UNWIND (frame, op_ret, op_errno, stats);
+ return 0;
+}
+
+
+int32_t
+map_stats (call_frame_t *frame,
+ xlator_t *this,
+ int32_t flags)
+{
+ STACK_WIND (frame,
+ map_stats_cbk,
+ subvol,
+ subvol->mops->stats,
+ flags);
+ return 0;
+ err:
+ STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+#endif /* if 0 */
+
+
+/* TODO: define the behavior of notify */
+
+
+/*
+ * fini - release everything init() hung off this->private.
+ *
+ * Frees the child array and every node of the pattern list, then the
+ * private structure itself.  this->private is reset to NULL so that a
+ * second call (init() calls fini() on its error path) finds nothing to
+ * free instead of a dangling pointer.
+ *
+ * NOTE(review): the 'directory' string of each pattern node is not
+ * released here; whether the node owns it depends on
+ * verify_dir_and_assign_subvol(), which is not part of this file.
+ */
+void
+fini (xlator_t *this)
+{
+        map_private_t      *priv     = NULL;
+        struct map_pattern *trav_map = NULL;
+        struct map_pattern *tmp_map  = NULL;
+
+        priv = this->private;
+        if (!priv)
+                return;
+
+        if (priv->xlarray) {
+                FREE (priv->xlarray);
+        }
+
+        trav_map = priv->map;
+        while (trav_map) {
+                tmp_map  = trav_map;
+                trav_map = trav_map->next;
+                FREE (tmp_map);
+        }
+
+        FREE (priv);
+        this->private = NULL;
+
+        return;
+}
+
+/*
+ * init - set up the map translator.
+ *
+ * Builds the array of children, then parses the "map-directory" option,
+ * a ';'-separated list of "directory:subvolume" pairs, handing each
+ * pair to verify_dir_and_assign_subvol().  An optional "default-volume"
+ * option is passed to assign_default_subvol().
+ *
+ * Returns 0 on success and -1 on failure; on failure everything
+ * allocated so far is released and this->private is left NULL.
+ *
+ * NOTE(review): strtok_r() tokenises the option string in place, so the
+ * value stored in this->options is modified by this function.
+ */
+int
+init (xlator_t *this)
+{
+        map_private_t *priv           = NULL;
+        xlator_list_t *trav           = NULL;
+        int            count          = 0;
+        int            ret            = -1;
+        char          *pattern_string = NULL;
+        char          *map_pair_str   = NULL;
+        char          *tmp_str        = NULL;
+        char          *tmp_str1       = NULL;
+        char          *dup_map_pair   = NULL;
+        char          *dir_str        = NULL;
+        char          *subvol_str     = NULL;
+        char          *default_xl     = NULL;
+
+        if (!this->children) {
+                gf_log (this->name,GF_LOG_ERROR,
+                        "FATAL: map should have one or more child defined");
+                return -1;
+        }
+
+        if (!this->parents) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "dangling volume. check volfile ");
+        }
+
+        priv = CALLOC (1, sizeof (map_private_t));
+        if (!priv) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                return -1;
+        }
+        this->private = priv;
+
+        /* allocate xlator array */
+        for (trav = this->children; trav; trav = trav->next)
+                count++;
+
+        priv->xlarray = CALLOC (count, sizeof (struct map_xlator_array));
+        if (!priv->xlarray) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+        priv->child_count = count;
+
+        /* build xlator array */
+        count = 0;
+        for (trav = this->children; trav; trav = trav->next)
+                priv->xlarray[count++].xl = trav->xlator;
+
+        /* map dir1:brick1;dir2:brick2;dir3:brick3;*:brick4 */
+        ret = dict_get_str (this->options, "map-directory", &pattern_string);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "map-directory option not given, can't continue");
+                goto err;
+        }
+
+        map_pair_str = strtok_r (pattern_string, ";", &tmp_str);
+        while (map_pair_str) {
+                /* the pair is split on a private copy, freed per iteration */
+                dup_map_pair = strdup (map_pair_str);
+                if (!dup_map_pair) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "memory allocation failed :(");
+                        goto err;
+                }
+
+                dir_str = strtok_r (dup_map_pair, ":", &tmp_str1);
+                if (!dir_str) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "directory string invalid");
+                        goto err;
+                }
+                subvol_str = strtok_r (NULL, ":", &tmp_str1);
+                if (!subvol_str) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "mapping subvolume string invalid");
+                        goto err;
+                }
+                ret = verify_dir_and_assign_subvol (this,
+                                                    dir_str,
+                                                    subvol_str);
+                if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "verification failed");
+                        goto err;
+                }
+
+                FREE (dup_map_pair);
+                /* reset so the error path never frees it a second time */
+                dup_map_pair = NULL;
+
+                map_pair_str = strtok_r (NULL, ";", &tmp_str);
+        }
+
+        /* default-volume brick4 */
+        ret = dict_get_str (this->options, "default-volume", &default_xl);
+        if (ret == 0) {
+                ret = assign_default_subvol (this, default_xl);
+                if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "assigning default failed");
+                        goto err;
+                }
+        }
+
+        verify_if_all_subvolumes_got_used (this);
+
+        return 0;
+
+err:
+        /* release the pair copy that was live when parsing failed */
+        if (dup_map_pair) {
+                FREE (dup_map_pair);
+        }
+        fini (this);
+        /* fini() has freed the private data; do not leave it dangling */
+        this->private = NULL;
+        return -1;
+}
+
+
+/* File operations implemented by the map translator. */
+struct xlator_fops fops = {
+        /* namespace: lookup, entry creation, removal and rename */
+        .lookup      = map_lookup,
+        .mknod       = map_mknod,
+        .mkdir       = map_mkdir,
+        .create      = map_create,
+        .symlink     = map_symlink,
+        .link        = map_link,
+        .unlink      = map_unlink,
+        .rmdir       = map_rmdir,
+        .rename      = map_rename,
+
+        /* attributes */
+        .stat        = map_stat,
+        .fstat       = map_fstat,
+        .chmod       = map_chmod,
+        .fchmod      = map_fchmod,
+        .chown       = map_chown,
+        .fchown      = map_fchown,
+        .utimens     = map_utimens,
+        .truncate    = map_truncate,
+        .ftruncate   = map_ftruncate,
+        .access      = map_access,
+        .readlink    = map_readlink,
+        .statfs      = map_statfs,
+
+        /* extended attributes */
+        .setxattr    = map_setxattr,
+        .getxattr    = map_getxattr,
+        .removexattr = map_removexattr,
+        .xattrop     = map_xattrop,
+        .fxattrop    = map_fxattrop,
+
+        /* file I/O */
+        .open        = map_open,
+        .readv       = map_readv,
+        .writev      = map_writev,
+        .flush       = map_flush,
+        .fsync       = map_fsync,
+
+        /* directories */
+        .opendir     = map_opendir,
+        .readdir     = map_readdir,
+        .fsyncdir    = map_fsyncdir,
+        .getdents    = map_getdents,
+        .setdents    = map_setdents,
+
+        /* locking */
+        .lk          = map_lk,
+        .inodelk     = map_inodelk,
+        .finodelk    = map_finodelk,
+        .entrylk     = map_entrylk,
+        .fentrylk    = map_fentrylk,
+
+        /* checksum */
+        .checksum    = map_checksum,
+};
+
+/* No management operations are implemented (see the disabled map_stats). */
+struct xlator_mops mops = {
+};
+
+/* No callbacks are registered. */
+struct xlator_cbks cbks = {
+};
+
+/* Volume-file options read by init(); the list ends with a NULL key. */
+struct volume_options options[] = {
+ /* "dir1:brick1;dir2:brick2;..." - directory to subvolume pairs */
+ { .key = {"map-directory"},
+ .type = GF_OPTION_TYPE_ANY
+ },
+ /* name handed to assign_default_subvol() when the option is present */
+ { .key = {"default-volume"},
+ .type = GF_OPTION_TYPE_XLATOR
+ },
+
+ { .key = {NULL} }
+};
diff --git a/xlators/cluster/map/src/map.h b/xlators/cluster/map/src/map.h
new file mode 100644
index 000000000..0f1aabfd6
--- /dev/null
+++ b/xlators/cluster/map/src/map.h
@@ -0,0 +1,76 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __MAP_H__
+#define __MAP_H__
+
+#include "xlator.h"
+
+struct map_pattern { /* one node of a singly linked list of directory/translator pairs */
+ struct map_pattern *next; /* following node, linked through this pointer */
+ xlator_t *xl; /* NOTE(review): presumably the subvolume serving 'directory' -- confirm */
+ char *directory; /* directory string of this entry */
+ int dir_len; /* NOTE(review): presumably strlen (directory) -- confirm where it is filled in */
+};
+
+struct map_xlator_array { /* pairs a translator with a "mapped" flag */
+ xlator_t *xl; /* the translator this slot refers to */
+ int mapped; /* yes/no */
+};
+
+typedef struct { /* per-translator private state of the map translator */
+ struct map_pattern *map; /* head of the map_pattern list */
+ xlator_t *default_xl; /* NOTE(review): presumably set by assign_default_subvol() -- confirm */
+ struct map_xlator_array *xlarray; /* array of map_xlator_array slots */
+ int child_count; /* NOTE(review): presumably the number of xlarray slots -- confirm */
+} map_private_t;
+
+typedef struct { /* per-call scratch state; NOTE(review): presumably hung off frame->local -- confirm */
+ int32_t op_ret; /* result code to hand back */
+ int32_t op_errno; /* errno to hand back */
+ int call_count; /* NOTE(review): presumably the number of outstanding child replies -- confirm */
+ struct statvfs statvfs;
+ struct stat stbuf;
+ inode_t *inode;
+ dict_t *dict;
+ fd_t *fd;
+
+ size_t size;
+} map_local_t;
+
+xlator_t *map_subvol_next (xlator_t *this, xlator_t *prev); /* helpers shared by the map translator sources; definitions live outside this header */
+int map_subvol_cnt (xlator_t *this, xlator_t *subvol);
+
+int map_itransform (xlator_t *this, xlator_t *subvol,
+ uint64_t x, uint64_t *y_p); /* NOTE(review): presumably the inverse of map_deitransform() -- confirm */
+int map_deitransform (xlator_t *this, uint64_t y,
+ xlator_t **subvol_p, uint64_t *x_p);
+
+
+xlator_t *get_mapping_subvol_from_path (xlator_t *this, const char *path);
+xlator_t *get_mapping_subvol_from_ctx (xlator_t *this, inode_t *inode);
+
+int check_multiple_volume_entry (xlator_t *this, xlator_t *subvol);
+int verify_dir_and_assign_subvol (xlator_t *this,
+ const char *directory, const char *subvol);
+int assign_default_subvol (xlator_t *this, const char *default_xl); /* called with the value of the "default-volume" option; a non-zero return is treated as failure */
+void verify_if_all_subvolumes_got_used (xlator_t *this); /* called once after the option parsing has succeeded */
+
+
+#endif /* __MAP_H__ */
diff --git a/xlators/cluster/stripe/Makefile.am b/xlators/cluster/stripe/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/cluster/stripe/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/cluster/stripe/src/Makefile.am b/xlators/cluster/stripe/src/Makefile.am
new file mode 100644
index 000000000..60e0a1568
--- /dev/null
+++ b/xlators/cluster/stripe/src/Makefile.am
@@ -0,0 +1,14 @@
+
+xlator_LTLIBRARIES = stripe.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+# libtool's flag is spelled "-avoid-version"; "-avoidversion" is not a
+# libtool link-mode option, so the module was still being versioned.
+stripe_la_LDFLAGS = -module -avoid-version
+
+stripe_la_SOURCES = stripe.c
+stripe_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
+ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES =
+
diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c
new file mode 100644
index 000000000..83787ca2a
--- /dev/null
+++ b/xlators/cluster/stripe/src/stripe.c
@@ -0,0 +1,3286 @@
+/*
+ Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * xlators/cluster/stripe:
+ * Stripe translator: stripes the data across its child nodes,
+ * as per the options given in the volfile. The striping is
+ * fairly simple: data is written to the files at different offsets,
+ * as per calculation. So, 'ls -l' output at the real posix level will
+ * show a file size bigger than the actual size. But when one does
+ * 'df' or 'du <file>', the real size of the file on the server is shown.
+ *
+ * WARNING:
+ * Stripe translator can't regenerate data if a child node gets disconnected.
+ * So, no 'self-heal' for stripe. Hence the advice: use stripe only when it's
+ * very much necessary, or else use it in combination with AFR, to have a
+ * backup copy.
+ */
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "logging.h"
+#include "defaults.h"
+#include "compat.h"
+#include "compat-errno.h"
+#include <fnmatch.h>
+#include <signal.h>
+
+/*
+ * Guard used at the top of fops: when '_loc' is NULL or carries no inode,
+ * unwind the call with EINVAL and return 0 from the *enclosing* function.
+ *
+ * The macro relies on a variable named 'frame' being in scope at the point
+ * of use, and always unwinds with three NULL trailing arguments.
+ * The argument is parenthesised so that any pointer expression can be
+ * passed safely.
+ */
+#define STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR(_loc) do {              \
+        if (!((_loc) && (_loc)->inode)) {                                \
+                STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL);      \
+                return 0;                                                \
+        }                                                                \
+} while(0)
+
+/**
+ * struct stripe_options : This keeps the pattern and the block-size
+ * information, which is used for striping on a file.
+ *
+ * Entries form a singly linked list that stripe_get_matching_bs() walks
+ * in order; the first entry whose pattern matches the path wins.
+ */
+struct stripe_options {
+ struct stripe_options *next; /* following entry, NULL terminates the walk */
+ char path_pattern[256]; /* pattern handed to fnmatch() */
+ uint64_t block_size; /* block size used when path_pattern matches */
+};
+
+/**
+ * Private structure for stripe translator
+ *
+ * Reached through this->private in every fop.
+ */
+struct stripe_private {
+ struct stripe_options *pattern; /* pattern list passed to stripe_get_matching_bs() */
+ xlator_t **xl_array; /* NOTE(review): presumably the children as an array -- confirm */
+ uint64_t block_size; /* default block size when no pattern matches */
+ gf_lock_t lock;
+ uint8_t nodes_down; /* non-zero makes mknod of a regular file fail with EIO */
+ int8_t first_child_down; /* non-zero makes modifying fops fail early (ENOTCONN/EIO) */
+ int8_t child_count; /* used to initialise local->call_count before fanning out */
+ int8_t state[256]; /* Current state of the child node,
+ 0 for down, 1 for up */
+ gf_boolean_t xattr_supported; /* 0 for no, 1 for yes, default yes */
+};
+
+/**
+ * Used to keep info about the replies received from fops->readv calls
+ *
+ * NOTE(review): presumably one element per wound readv call -- confirm
+ * against the readv implementation.
+ */
+struct readv_replies {
+ struct iovec *vector;
+ int32_t count; //count of vector
+ int32_t op_ret; //op_ret of readv
+ int32_t op_errno; // op_errno of readv
+ struct stat stbuf; /* 'stbuf' is also a part of reply */
+};
+
+/**
+ * Local structure to be passed with all the frames in case of STACK_WIND
+ *
+ * Allocated zeroed with CALLOC by the fops and stored in frame->local;
+ * the aggregating callbacks update it under frame->lock.
+ */
+struct stripe_local; /* this itself is used inside the structure; */
+
+struct stripe_local {
+ struct stripe_local *next;
+ call_frame_t *orig_frame;
+
+ /* Used by _cbk functions */
+ struct stat stbuf; /* stat merged from the child replies */
+ struct readv_replies *replies;
+ struct statvfs statvfs_buf; /* statfs totals accumulated over the children */
+ dir_entry_t *entry;
+ struct xlator_stats stats;
+
+ int8_t revalidate;
+ int8_t failed; /* set by callbacks on a child error; forces op_ret to -1 */
+ int8_t unwind;
+
+ int32_t node_index;
+ int32_t call_count; /* child replies still outstanding */
+ int32_t wind_count; /* used instead of child_count
+ in case of read and write */
+ int32_t op_ret; /* result handed to STACK_UNWIND */
+ int32_t op_errno; /* errno handed to STACK_UNWIND */
+ int32_t count;
+ int32_t flags;
+ char *name;
+ inode_t *inode;
+
+ loc_t loc; /* filled with loc_copy(), released with loc_wipe() */
+ loc_t loc2; /* second location (rename target), same ownership as 'loc' */
+
+ /* For File I/O fops */
+ dict_t *dict; /* also holds the referenced xattr dict during lookup */
+
+ /* General usage */
+ off_t offset;
+ off_t stripe_size; /* block size picked by stripe_get_matching_bs() */
+
+ int8_t *list;
+ struct flock lock;
+ fd_t *fd;
+ void *value;
+};
+
+typedef struct stripe_local stripe_local_t;
+typedef struct stripe_private stripe_private_t;
+
+/**
+ * stripe_get_matching_bs - Get the matching block size for the given path.
+ *
+ * @path:       path to match; a NULL path yields @default_bs
+ * @opts:       head of the pattern list (may be NULL)
+ * @default_bs: block size used when no pattern matches
+ *
+ * Walks @opts in order and returns the block size of the first entry whose
+ * path_pattern matches @path (fnmatch with FNM_NOESCAPE).
+ *
+ * fnmatch() does not modify its arguments, so the path is matched in place
+ * instead of through a strdup()'ed copy whose allocation could fail.
+ *
+ * Note: the value is returned through an int32_t although block sizes are
+ * stored as uint64_t, so very large sizes are truncated by the return type.
+ */
+int32_t
+stripe_get_matching_bs (const char *path,
+                        struct stripe_options *opts,
+                        uint64_t default_bs)
+{
+        struct stripe_options *trav = NULL;
+
+        if (!path)
+                return default_bs;
+
+        for (trav = opts; trav; trav = trav->next) {
+                if (fnmatch (trav->path_pattern,
+                             path, FNM_NOESCAPE) == 0)
+                        return trav->block_size;
+        }
+
+        return default_bs;
+}
+
+
+/*
+ * stripe_common_cbk - pass-through callback for fops that were wound to a
+ * single child: the reply is handed to the caller unchanged.
+ */
+int32_t
+stripe_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno)
+{
+        /* Nothing to aggregate, unwind with exactly what came back. */
+        STACK_UNWIND (frame, op_ret, op_errno);
+
+        return 0;
+}
+
+/**
+ * stripe_stack_unwind_cbk - aggregating callback for fops whose reply has
+ * no extra arguments (fsync, unlink, rmdir, ...).
+ *
+ * Each child reply decrements local->call_count. Errors are recorded in
+ * local->op_errno (ENOTCONN additionally marks the call as failed),
+ * successes in local->op_ret. The reply that brings the counter to zero
+ * releases the saved locations and unwinds.
+ */
+int32_t
+stripe_stack_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno)
+{
+        stripe_local_t *local   = frame->local;
+        call_frame_t   *prev    = cookie;
+        int32_t         pending = 0;
+
+        LOCK (&frame->lock);
+        {
+                pending = --local->call_count;
+
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s returned %s",
+                                prev->this->name, strerror (op_errno));
+                        local->op_errno = op_errno;
+                        if (op_errno == ENOTCONN)
+                                local->failed = 1;
+                } else if (op_ret >= 0) {
+                        local->op_ret = op_ret;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        /* Still waiting for other children. */
+        if (pending)
+                return 0;
+
+        if (local->failed)
+                local->op_ret = -1;
+
+        if (local->loc.path)
+                loc_wipe (&local->loc);
+        if (local->loc2.path)
+                loc_wipe (&local->loc2);
+
+        STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+        return 0;
+}
+
+/*
+ * stripe_common_buf_cbk - pass-through callback for single-child fops that
+ * reply with a stat buffer.
+ */
+int32_t
+stripe_common_buf_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        /* Single reply, forward it as is. */
+        STACK_UNWIND (frame, op_ret, op_errno, buf);
+
+        return 0;
+}
+
+/**
+ * stripe_stack_unwind_buf_cbk - aggregating callback for fops that reply
+ * with a 'struct stat' (chmod, fchmod, chown, fchown, truncate, ftruncate,
+ * utimens, ...).
+ *
+ * @cookie - treated as the 'call_frame_t *' of the replying child; its
+ *           ->this identifies the child translator.
+ *
+ * The first successful reply seeds local->stbuf; st_blocks is summed over
+ * all successful replies, st_size keeps the largest value seen, and st_ino
+ * and st_mtime are taken from the first child. The last reply wipes the
+ * saved locations and unwinds with the merged buffer.
+ */
+int32_t
+stripe_stack_unwind_buf_cbk (call_frame_t *frame, void *cookie,
+                             xlator_t *this, int32_t op_ret,
+                             int32_t op_errno, struct stat *buf)
+{
+        stripe_local_t *local   = frame->local;
+        call_frame_t   *prev    = cookie;
+        int32_t         pending = 0;
+
+        LOCK (&frame->lock);
+        {
+                pending = --local->call_count;
+
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s returned error %s",
+                                prev->this->name, strerror (op_errno));
+                        local->op_errno = op_errno;
+                        if (op_errno == ENOTCONN)
+                                local->failed = 1;
+                } else if (op_ret == 0) {
+                        local->op_ret = 0;
+
+                        if (local->stbuf.st_blksize == 0) {
+                                /* First success: take the whole buffer, but
+                                   start the block count from zero since it
+                                   is added again just below. */
+                                local->stbuf = *buf;
+                                local->stbuf.st_blocks = 0;
+                        }
+
+                        if (FIRST_CHILD(this) == prev->this) {
+                                /* The layer above always sees the inode
+                                   number of the first child. */
+                                local->stbuf.st_ino = buf->st_ino;
+                                local->stbuf.st_mtime = buf->st_mtime;
+                        }
+
+                        local->stbuf.st_blocks += buf->st_blocks;
+                        if (local->stbuf.st_size < buf->st_size)
+                                local->stbuf.st_size = buf->st_size;
+
+                        /* TODO: when st_blksize differs between children,
+                           add to blocks in terms of the original block
+                           size */
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (pending)
+                return 0;
+
+        if (local->failed)
+                local->op_ret = -1;
+
+        if (local->loc.path)
+                loc_wipe (&local->loc);
+        if (local->loc2.path)
+                loc_wipe (&local->loc2);
+
+        STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                      &local->stbuf);
+
+        return 0;
+}
+
+/*
+ * stripe_common_inode_cbk - pass-through callback for fops that create the
+ * entry on the first node only (symlink, mknod of non-regular files).
+ */
+int32_t
+stripe_common_inode_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno,
+                         inode_t *inode, struct stat *buf)
+{
+        /* Only one child was asked, so there is nothing to merge. */
+        STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
+
+        return 0;
+}
+
+/**
+ * stripe_stack_unwind_inode_cbk - aggregating callback for fops that reply
+ * with an inode and a stat buffer; the original note names link (),
+ * symlink (), mkdir () and mknod () as users.
+ *
+ * The first successful reply seeds local->inode and local->stbuf. Block
+ * counts are summed, the largest size wins, and inode number and mtime
+ * come from the first child. The last reply unwinds with the merged
+ * result; an ENOTCONN from any child turns the whole call into a failure.
+ */
+int32_t
+stripe_stack_unwind_inode_cbk (call_frame_t *frame, void *cookie,
+                               xlator_t *this, int32_t op_ret,
+                               int32_t op_errno, inode_t *inode,
+                               struct stat *buf)
+{
+        stripe_local_t *local   = frame->local;
+        call_frame_t   *prev    = cookie;
+        int32_t         pending = 0;
+
+        LOCK (&frame->lock);
+        {
+                pending = --local->call_count;
+
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s returned error %s",
+                                prev->this->name, strerror (op_errno));
+                        local->op_errno = op_errno;
+                        if (op_errno == ENOTCONN)
+                                local->failed = 1;
+                } else if (op_ret >= 0) {
+                        local->op_ret = 0;
+
+                        if (local->stbuf.st_blksize == 0) {
+                                local->inode = inode;
+                                local->stbuf = *buf;
+                                /* summed again right below */
+                                local->stbuf.st_blocks = 0;
+                        }
+
+                        if (FIRST_CHILD(this) == prev->this) {
+                                local->stbuf.st_ino = buf->st_ino;
+                                local->stbuf.st_mtime = buf->st_mtime;
+                        }
+
+                        local->stbuf.st_blocks += buf->st_blocks;
+                        if (local->stbuf.st_size < buf->st_size)
+                                local->stbuf.st_size = buf->st_size;
+
+                        /* TODO: when st_blksize differs between children,
+                           add to blocks in terms of the original block
+                           size */
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (pending)
+                return 0;
+
+        if (local->failed)
+                local->op_ret = -1;
+
+        STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                      local->inode, &local->stbuf);
+
+        return 0;
+}
+
+/**
+ * stripe_stack_unwind_inode_lookup_cbk - aggregating callback of lookup.
+ *
+ * Works like the inode/stat aggregation of the other callbacks and also
+ * keeps one reference on an xattr dictionary: the first child's dictionary
+ * replaces whatever was stored before, any other child's dictionary is
+ * kept only while nothing is stored yet. ENOENT replies are not logged.
+ * The stored reference is dropped after the unwind.
+ */
+int32_t
+stripe_stack_unwind_inode_lookup_cbk (call_frame_t *frame, void *cookie,
+                                      xlator_t *this, int32_t op_ret,
+                                      int32_t op_errno, inode_t *inode,
+                                      struct stat *buf, dict_t *dict)
+{
+        stripe_local_t *local     = frame->local;
+        call_frame_t   *prev      = cookie;
+        dict_t         *held_dict = NULL;
+        int32_t         pending   = 0;
+
+        LOCK (&frame->lock);
+        {
+                pending = --local->call_count;
+
+                if (op_ret == -1) {
+                        if (op_errno != ENOENT)
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "%s returned error %s",
+                                        prev->this->name,
+                                        strerror (op_errno));
+                        local->op_errno = op_errno;
+                        if (op_errno == ENOTCONN)
+                                local->failed = 1;
+                } else if (op_ret >= 0) {
+                        local->op_ret = 0;
+
+                        if (local->stbuf.st_blksize == 0) {
+                                local->inode = inode;
+                                local->stbuf = *buf;
+                                /* summed again right below */
+                                local->stbuf.st_blocks = 0;
+                        }
+
+                        if (FIRST_CHILD(this) == prev->this) {
+                                local->stbuf.st_ino = buf->st_ino;
+                                local->stbuf.st_mtime = buf->st_mtime;
+                                /* first child's dict overrides any other */
+                                if (local->dict)
+                                        dict_unref (local->dict);
+                                local->dict = dict_ref (dict);
+                        } else if (!local->dict) {
+                                local->dict = dict_ref (dict);
+                        }
+
+                        local->stbuf.st_blocks += buf->st_blocks;
+                        if (local->stbuf.st_size < buf->st_size)
+                                local->stbuf.st_size = buf->st_size;
+
+                        /* TODO: when st_blksize differs between children,
+                           add to blocks in terms of the original block
+                           size */
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (pending)
+                return 0;
+
+        if (local->failed)
+                local->op_ret = -1;
+
+        /* Remember the dict before unwinding so that the reference can be
+           dropped afterwards without touching 'local' again. */
+        held_dict = local->dict;
+        STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                      local->inode, &local->stbuf, local->dict);
+        if (held_dict)
+                dict_unref (held_dict);
+
+        return 0;
+}
+
+
+/**
+ * stripe_lookup - look up a location.
+ *
+ * When the inode has no mode yet, or is a directory or a regular file, the
+ * lookup is sent to every child; otherwise only the first child is asked.
+ * Replies are merged by stripe_stack_unwind_inode_lookup_cbk.
+ */
+int32_t
+stripe_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
+               dict_t *xattr_req)
+{
+        stripe_private_t *priv    = this->private;
+        stripe_local_t   *local   = NULL;
+        xlator_list_t    *child   = NULL;
+        char              fan_out = 0;
+
+        if (!loc || !loc->inode) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "wrong argument, returning EINVAL");
+                STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL);
+                return 0;
+        }
+
+        /* Per-call state, pessimistic until a child reports success. */
+        local = CALLOC (1, sizeof (stripe_local_t));
+        ERR_ABORT (local);
+        local->op_ret = -1;
+        frame->local = local;
+
+        if ((!loc->inode->st_mode) ||
+            S_ISDIR (loc->inode->st_mode) ||
+            S_ISREG (loc->inode->st_mode))
+                fan_out = 1;
+
+        if (!fan_out) {
+                local->call_count = 1;
+
+                STACK_WIND (frame,
+                            stripe_stack_unwind_inode_lookup_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->lookup,
+                            loc, xattr_req);
+                return 0;
+        }
+
+        /* Every child node takes part in the lookup. */
+        local->call_count = priv->child_count;
+        for (child = this->children; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_inode_lookup_cbk,
+                            child->xlator,
+                            child->xlator->fops->lookup,
+                            loc, xattr_req);
+        }
+
+        return 0;
+}
+
+/**
+ * stripe_stat - stat a location. Directories and regular files are stat'ed
+ * on every child and merged by stripe_stack_unwind_buf_cbk; anything else
+ * is answered by the first child alone.
+ */
+int32_t
+stripe_stat (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        stripe_private_t *priv  = this->private;
+        stripe_local_t   *local = NULL;
+        xlator_list_t    *child = NULL;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        if (!S_ISDIR (loc->inode->st_mode) &&
+            !S_ISREG (loc->inode->st_mode)) {
+                STACK_WIND (frame,
+                            stripe_common_buf_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->stat,
+                            loc);
+                return 0;
+        }
+
+        /* Per-call state for the fan-out. */
+        local = CALLOC (1, sizeof (stripe_local_t));
+        ERR_ABORT (local);
+        local->op_ret = -1;
+        frame->local = local;
+        local->inode = loc->inode;
+        local->call_count = priv->child_count;
+
+        for (child = this->children; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_buf_cbk,
+                            child->xlator,
+                            child->xlator->fops->stat,
+                            loc);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_chmod - change the mode of a location. Refused with ENOTCONN
+ * while the first child is down. Directories and regular files are changed
+ * on every child, other file types on the first child only.
+ */
+int32_t
+stripe_chmod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode)
+{
+        stripe_private_t *priv  = this->private;
+        stripe_local_t   *local = NULL;
+        xlator_list_t    *child = NULL;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        if (priv->first_child_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "First node down, returning ENOTCONN");
+                STACK_UNWIND (frame, -1, ENOTCONN, NULL);
+                return 0;
+        }
+
+        if (!S_ISDIR (loc->inode->st_mode) &&
+            !S_ISREG (loc->inode->st_mode)) {
+                STACK_WIND (frame,
+                            stripe_common_buf_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->chmod,
+                            loc, mode);
+                return 0;
+        }
+
+        /* Per-call state for the fan-out. */
+        local = CALLOC (1, sizeof (stripe_local_t));
+        ERR_ABORT (local);
+        local->op_ret = -1;
+        frame->local = local;
+        local->inode = loc->inode;
+        local->call_count = priv->child_count;
+
+        for (child = this->children; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_buf_cbk,
+                            child->xlator,
+                            child->xlator->fops->chmod,
+                            loc, mode);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_chown - change owner and group of a location. Refused with
+ * ENOTCONN while the first child is down. Directories and regular files
+ * are changed on every child, other file types only on the first entry of
+ * the children list.
+ */
+int32_t
+stripe_chown (call_frame_t *frame, xlator_t *this, loc_t *loc,
+              uid_t uid, gid_t gid)
+{
+        stripe_private_t *priv  = this->private;
+        stripe_local_t   *local = NULL;
+        xlator_list_t    *child = NULL;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        if (priv->first_child_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "First node down, returning ENOTCONN");
+                STACK_UNWIND (frame, -1, ENOTCONN, NULL);
+                return 0;
+        }
+
+        child = this->children;
+
+        if (!S_ISDIR (loc->inode->st_mode) &&
+            !S_ISREG (loc->inode->st_mode)) {
+                STACK_WIND (frame,
+                            stripe_common_buf_cbk,
+                            child->xlator,
+                            child->xlator->fops->chown,
+                            loc, uid, gid);
+                return 0;
+        }
+
+        /* Per-call state for the fan-out. */
+        local = CALLOC (1, sizeof (stripe_local_t));
+        ERR_ABORT (local);
+        local->op_ret = -1;
+        frame->local = local;
+        local->inode = loc->inode;
+        local->call_count = priv->child_count;
+
+        for (child = this->children; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_buf_cbk,
+                            child->xlator,
+                            child->xlator->fops->chown,
+                            loc, uid, gid);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_statfs_cbk - merge the statfs replies of the children.
+ *
+ * Block and file counters are summed over the successful replies; the
+ * remaining fields simply carry the values of the most recent successful
+ * reply. A failure other than ENOTCONN overwrites local->op_errno. The
+ * last reply unwinds with the accumulated buffer.
+ */
+int32_t
+stripe_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct statvfs *stbuf)
+{
+        stripe_local_t *local   = (stripe_local_t *)frame->local;
+        struct statvfs *total   = NULL;
+        int32_t         pending;
+
+        LOCK(&frame->lock);
+        {
+                pending = --local->call_count;
+
+                if (op_ret == 0) {
+                        total = &local->statvfs_buf;
+
+                        /* copied from the latest reply */
+                        total->f_bsize   = stbuf->f_bsize;
+                        total->f_frsize  = stbuf->f_frsize;
+
+                        /* summed over all children */
+                        total->f_blocks += stbuf->f_blocks;
+                        total->f_bfree  += stbuf->f_bfree;
+                        total->f_bavail += stbuf->f_bavail;
+                        total->f_files  += stbuf->f_files;
+                        total->f_ffree  += stbuf->f_ffree;
+                        total->f_favail += stbuf->f_favail;
+
+                        /* copied from the latest reply */
+                        total->f_fsid    = stbuf->f_fsid;
+                        total->f_flag    = stbuf->f_flag;
+                        total->f_namemax = stbuf->f_namemax;
+
+                        local->op_ret = 0;
+                } else if (op_errno != ENOTCONN) {
+                        local->op_errno = op_errno;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (pending)
+                return 0;
+
+        STACK_UNWIND (frame, local->op_ret,
+                      local->op_errno, &local->statvfs_buf);
+
+        return 0;
+}
+
+
+/**
+ * stripe_statfs - ask every child for its file-system statistics; the
+ * replies are merged by stripe_statfs_cbk. The call starts out as failed
+ * with ENOTCONN until a child answers successfully.
+ */
+int32_t
+stripe_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        stripe_private_t *priv  = this->private;
+        stripe_local_t   *local = NULL;
+        xlator_list_t    *child = this->children;
+
+        /* Per-call state. */
+        local = CALLOC (1, sizeof (stripe_local_t));
+        ERR_ABORT (local);
+        local->op_ret = -1;
+        local->op_errno = ENOTCONN;
+        frame->local = local;
+
+        local->call_count = priv->child_count;
+
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_statfs_cbk,
+                            child->xlator,
+                            child->xlator->fops->statfs,
+                            loc);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_truncate - truncate a location to 'offset'. Refused with ENOTCONN
+ * while the first child is down. Directories and regular files are handled
+ * on every child, other file types on the first entry of the children
+ * list only.
+ */
+int32_t
+stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
+                 off_t offset)
+{
+        stripe_private_t *priv  = this->private;
+        stripe_local_t   *local = NULL;
+        xlator_list_t    *child = this->children;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        if (priv->first_child_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "First node down, returning ENOTCONN");
+                STACK_UNWIND (frame, -1, ENOTCONN, NULL);
+                return 0;
+        }
+
+        if (!S_ISDIR (loc->inode->st_mode) &&
+            !S_ISREG (loc->inode->st_mode)) {
+                STACK_WIND (frame,
+                            stripe_common_buf_cbk,
+                            child->xlator,
+                            child->xlator->fops->truncate,
+                            loc,
+                            offset);
+                return 0;
+        }
+
+        /* Per-call state for the fan-out. */
+        local = CALLOC (1, sizeof (stripe_local_t));
+        ERR_ABORT (local);
+        local->op_ret = -1;
+        frame->local = local;
+        local->inode = loc->inode;
+        local->call_count = priv->child_count;
+
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_buf_cbk,
+                            child->xlator,
+                            child->xlator->fops->truncate,
+                            loc,
+                            offset);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_utimens - set the timestamps of a location. Refused with ENOTCONN
+ * while the first child is down. Directories and regular files are handled
+ * on every child, other file types on the first entry of the children
+ * list only.
+ */
+int32_t
+stripe_utimens (call_frame_t *frame, xlator_t *this, loc_t *loc,
+                struct timespec tv[2])
+{
+        stripe_private_t *priv  = this->private;
+        stripe_local_t   *local = NULL;
+        xlator_list_t    *child = this->children;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        if (priv->first_child_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "First node down, returning ENOTCONN");
+                STACK_UNWIND (frame, -1, ENOTCONN, NULL);
+                return 0;
+        }
+
+        if (!S_ISDIR (loc->inode->st_mode) &&
+            !S_ISREG (loc->inode->st_mode)) {
+                STACK_WIND (frame,
+                            stripe_common_buf_cbk,
+                            child->xlator,
+                            child->xlator->fops->utimens,
+                            loc, tv);
+                return 0;
+        }
+
+        /* Per-call state for the fan-out. */
+        local = CALLOC (1, sizeof (stripe_local_t));
+        ERR_ABORT (local);
+        local->op_ret = -1;
+        frame->local = local;
+        local->inode = loc->inode;
+        local->call_count = priv->child_count;
+
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_buf_cbk,
+                            child->xlator,
+                            child->xlator->fops->utimens,
+                            loc, tv);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_first_rename_cbk - reply of the rename sent to the first child.
+ *
+ * On failure the rename is not attempted on the remaining children: the
+ * location copies taken by stripe_rename are released and the error is
+ * unwound. (Previously the copies were leaked on this path.)
+ *
+ * On success the reply seeds local->stbuf and the rename is wound to every
+ * remaining child, whose replies are merged by stripe_stack_unwind_buf_cbk.
+ * If no reply is outstanding after the first child, the call is unwound
+ * here, since no later callback would ever do it.
+ */
+int32_t
+stripe_first_rename_cbk (call_frame_t *frame,
+                         void *cookie,
+                         xlator_t *this,
+                         int32_t op_ret,
+                         int32_t op_errno,
+                         struct stat *buf)
+{
+        stripe_local_t *local = frame->local;
+        xlator_list_t  *trav = this->children;
+
+        if (op_ret == -1)
+        {
+                /* Same clean-up the aggregating callback performs. */
+                if (local->loc.path)
+                        loc_wipe (&local->loc);
+                if (local->loc2.path)
+                        loc_wipe (&local->loc2);
+
+                STACK_UNWIND (frame, op_ret, op_errno, buf);
+                return 0;
+        }
+
+        local->op_ret = 0;
+        local->stbuf = *buf;
+        local->call_count--;
+
+        if (local->call_count == 0) {
+                /* The first child was the only one to ask. */
+                if (local->loc.path)
+                        loc_wipe (&local->loc);
+                if (local->loc2.path)
+                        loc_wipe (&local->loc2);
+
+                STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                              &local->stbuf);
+                return 0;
+        }
+
+        trav = trav->next; /* Skip first child */
+
+        while (trav) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_buf_cbk,
+                            trav->xlator,
+                            trav->xlator->fops->rename,
+                            &local->loc, &local->loc2);
+                trav = trav->next;
+        }
+
+        return 0;
+}
+/**
+ * stripe_rename - rename 'oldloc' to 'newloc'.
+ *
+ * Fails with EIO while the first child is down (the log message used to
+ * announce ENOTCONN, which did not match the errno actually returned).
+ * Otherwise both locations are copied into the call's local state and the
+ * rename is sent to the first child; stripe_first_rename_cbk continues
+ * with the remaining children once that has succeeded.
+ */
+int32_t
+stripe_rename (call_frame_t *frame,
+               xlator_t *this,
+               loc_t *oldloc,
+               loc_t *newloc)
+{
+        stripe_private_t *priv = this->private;
+        stripe_local_t   *local = NULL;
+        xlator_list_t    *trav = this->children;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc);
+
+        if (priv->first_child_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "First node down, returning EIO");
+                STACK_UNWIND (frame, -1, EIO, NULL);
+                return 0;
+        }
+
+        /* Initialization */
+        local = CALLOC (1, sizeof (stripe_local_t));
+        ERR_ABORT (local);
+        local->op_ret = -1;
+        local->inode = oldloc->inode;
+        /* Copies are needed because the remaining children are wound from
+           the callback; they are released again with loc_wipe(). */
+        loc_copy (&local->loc, oldloc);
+        loc_copy (&local->loc2, newloc);
+
+        local->call_count = priv->child_count;
+
+        frame->local = local;
+
+        STACK_WIND (frame,
+                    stripe_first_rename_cbk,
+                    trav->xlator,
+                    trav->xlator->fops->rename,
+                    oldloc, newloc);
+
+        return 0;
+}
+
+
+/**
+ * stripe_access - permission check; answered by the first child alone.
+ */
+int32_t
+stripe_access (call_frame_t *frame, xlator_t *this, loc_t *loc,
+               int32_t mask)
+{
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        /* The reply is forwarded untouched by stripe_common_cbk. */
+        STACK_WIND (frame, stripe_common_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->access,
+                    loc, mask);
+
+        return 0;
+}
+
+
+/**
+ * stripe_readlink_cbk - hand the link target returned by the child back
+ * to the caller unchanged.
+ */
+int32_t
+stripe_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, const char *path)
+{
+        /* Single reply, nothing to merge. */
+        STACK_UNWIND (frame, op_ret, op_errno, path);
+
+        return 0;
+}
+
+
+/**
+ * stripe_readlink - read a symbolic link from the first child. Refused
+ * with ENOTCONN while that child is down.
+ */
+int32_t
+stripe_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc,
+                 size_t size)
+{
+        stripe_private_t *priv = this->private;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        if (priv->first_child_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "First node down, returning ENOTCONN");
+                STACK_UNWIND (frame, -1, ENOTCONN, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, stripe_readlink_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->readlink,
+                    loc, size);
+
+        return 0;
+}
+
+
+/**
+ * stripe_unlink - remove a name. Refused with EIO while the first child is
+ * down. Directories and regular files are unlinked on every child, other
+ * file types on the first entry of the children list only.
+ */
+int32_t
+stripe_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        stripe_private_t *priv  = this->private;
+        stripe_local_t   *local = NULL;
+        xlator_list_t    *child = this->children;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        if (priv->first_child_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "First node down, returning EIO");
+                STACK_UNWIND (frame, -1, EIO);
+                return 0;
+        }
+
+        if (!S_ISDIR (loc->inode->st_mode) &&
+            !S_ISREG (loc->inode->st_mode)) {
+                STACK_WIND (frame,
+                            stripe_common_cbk,
+                            child->xlator,
+                            child->xlator->fops->unlink,
+                            loc);
+                return 0;
+        }
+
+        /* Per-call state for the fan-out. */
+        local = CALLOC (1, sizeof (stripe_local_t));
+        ERR_ABORT (local);
+        local->op_ret = -1;
+        frame->local = local;
+        local->call_count = priv->child_count;
+
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_cbk,
+                            child->xlator,
+                            child->xlator->fops->unlink,
+                            loc);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_first_rmdir_cbk - reply of the rmdir sent to the first child.
+ *
+ * On failure nothing is removed on the other children: the location copy
+ * taken by stripe_rmdir is released and the error is unwound. (Previously
+ * the copy was leaked on this path.)
+ *
+ * On success the rmdir is wound to every remaining child and the replies
+ * are collected by stripe_stack_unwind_cbk. If no reply is outstanding
+ * after the first child, the call is unwound here, since no later callback
+ * would ever do it.
+ */
+int32_t
+stripe_first_rmdir_cbk (call_frame_t *frame,
+                        void *cookie,
+                        xlator_t *this,
+                        int32_t op_ret,
+                        int32_t op_errno)
+{
+        xlator_list_t  *trav = this->children;
+        stripe_local_t *local = frame->local;
+
+        if (op_ret == -1)
+        {
+                /* Same clean-up the aggregating callback performs. */
+                if (local->loc.path)
+                        loc_wipe (&local->loc);
+
+                STACK_UNWIND (frame, op_ret, op_errno);
+                return 0;
+        }
+
+        local->call_count--; /* First child successful */
+
+        if (local->call_count == 0) {
+                /* The first child was the only one to ask. */
+                if (local->loc.path)
+                        loc_wipe (&local->loc);
+
+                STACK_UNWIND (frame, op_ret, op_errno);
+                return 0;
+        }
+
+        trav = trav->next; /* Skip first child */
+
+        while (trav) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_cbk,
+                            trav->xlator,
+                            trav->xlator->fops->rmdir,
+                            &local->loc);
+                trav = trav->next;
+        }
+
+        return 0;
+}
+
+/**
+ * stripe_rmdir - remove a directory. Refused with EIO while the first
+ * child is down. The request goes to the first entry of the children list;
+ * stripe_first_rmdir_cbk carries on with the others after it succeeded.
+ */
+int32_t
+stripe_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        stripe_private_t *priv  = this->private;
+        stripe_local_t   *local = NULL;
+        xlator_list_t    *first = this->children;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        if (priv->first_child_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "First node down, returning EIO");
+                STACK_UNWIND (frame, -1, EIO);
+                return 0;
+        }
+
+        /* Per-call state; the location is copied because the callback
+           winds the remaining children with it. */
+        local = CALLOC (1, sizeof (stripe_local_t));
+        ERR_ABORT (local);
+        local->op_ret = -1;
+        frame->local = local;
+        local->inode = loc->inode;
+        loc_copy (&local->loc, loc);
+        local->call_count = priv->child_count;
+
+        STACK_WIND (frame, stripe_first_rmdir_cbk,
+                    first->xlator, first->xlator->fops->rmdir,
+                    loc);
+
+        return 0;
+}
+
+
+/**
+ * stripe_setxattr - set extended attributes on the first child only.
+ * Refused with ENOTCONN while that child is down.
+ */
+int32_t
+stripe_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+                 dict_t *dict, int32_t flags)
+{
+        stripe_private_t *priv = this->private;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        if (priv->first_child_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "First node down, returning ENOTCONN");
+                STACK_UNWIND (frame, -1, ENOTCONN);
+                return 0;
+        }
+
+        STACK_WIND (frame, stripe_common_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr,
+                    loc, dict, flags);
+
+        return 0;
+}
+
+
+/**
+ * stripe_mknod_ifreg_fail_unlink_cbk - reply of the clean-up unlink that
+ * is sent to every child after the setxattr step of mknod has failed.
+ *
+ * The result of the unlink itself is ignored; once the last child has
+ * answered, the saved location is released and the error recorded earlier
+ * in 'local' is unwound.
+ */
+int32_t
+stripe_mknod_ifreg_fail_unlink_cbk (call_frame_t *frame, void *cookie,
+                                    xlator_t *this, int32_t op_ret,
+                                    int32_t op_errno)
+{
+        stripe_local_t *local   = frame->local;
+        int32_t         pending = 0;
+
+        LOCK (&frame->lock);
+        {
+                pending = --local->call_count;
+        }
+        UNLOCK (&frame->lock);
+
+        if (pending)
+                return 0;
+
+        loc_wipe (&local->loc);
+        STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                      local->inode, &local->stbuf);
+
+        return 0;
+}
+
+
+/**
+ * stripe_mknod_ifreg_setxattr_cbk - reply of the setxattr that follows a
+ * successful mknod of a regular file.
+ *
+ * A failing child marks the whole call as failed. After the last reply,
+ * a failed call is rolled back by unlinking the file on every child
+ * (completed by stripe_mknod_ifreg_fail_unlink_cbk); a successful one is
+ * unwound with the inode and stat gathered by the mknod callbacks.
+ */
+int32_t
+stripe_mknod_ifreg_setxattr_cbk (call_frame_t *frame, void *cookie,
+                                 xlator_t *this, int32_t op_ret,
+                                 int32_t op_errno)
+{
+        stripe_private_t *priv    = this->private;
+        stripe_local_t   *local   = frame->local;
+        xlator_list_t    *child   = this->children;
+        call_frame_t     *prev    = cookie;
+        int32_t           pending = 0;
+
+        LOCK (&frame->lock);
+        {
+                pending = --local->call_count;
+
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s returned error %s",
+                                prev->this->name, strerror (op_errno));
+                        local->op_ret = -1;
+                        local->op_errno = op_errno;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (pending)
+                return 0;
+
+        if (local->op_ret != -1) {
+                /* Every child carries the attributes, we are done. */
+                loc_wipe (&local->loc);
+                STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                              local->inode, &local->stbuf);
+                return 0;
+        }
+
+        /* Roll back: remove the freshly created file everywhere. */
+        local->call_count = priv->child_count;
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_mknod_ifreg_fail_unlink_cbk,
+                            child->xlator,
+                            child->xlator->fops->unlink,
+                            &local->loc);
+        }
+
+        return 0;
+}
+
+/**
+ * stripe_mknod_ifreg_cbk - reply of the mknod of a regular file, which is
+ * sent to every child.
+ *
+ * Replies are merged like in the other stat-aggregating callbacks; any
+ * failing child fails the whole call. After the last reply:
+ *  - if all children succeeded and xattrs are supported, every child gets
+ *    a setxattr carrying the stripe size, the stripe count and its own
+ *    stripe index (continued in stripe_mknod_ifreg_setxattr_cbk);
+ *  - otherwise the saved location is released and the result is unwound.
+ *
+ * The stripe count is taken from priv->child_count. It must not be read
+ * from local->call_count inside the loop, because setxattr replies that
+ * arrive while the loop is still winding decrement that counter, which
+ * made later children store too small a count.
+ * The key names are built with snprintf so that a long translator name
+ * cannot overrun the key buffers.
+ */
+int32_t
+stripe_mknod_ifreg_cbk (call_frame_t *frame,
+                        void *cookie,
+                        xlator_t *this,
+                        int32_t op_ret,
+                        int32_t op_errno,
+                        inode_t *inode,
+                        struct stat *buf)
+{
+        int               ret = 0;
+        int32_t           callcnt = 0;
+        stripe_local_t   *local = frame->local;
+        stripe_private_t *priv = this->private;
+
+        LOCK (&frame->lock);
+        {
+                callcnt = --local->call_count;
+
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s returned error %s",
+                                ((call_frame_t *)cookie)->this->name,
+                                strerror (op_errno));
+                        local->failed = 1;
+                        local->op_errno = op_errno;
+                }
+
+                if (op_ret >= 0) {
+                        local->op_ret = op_ret;
+                        /* Get the mapping in inode private */
+                        /* Get the stat buf right */
+                        if (local->stbuf.st_blksize == 0) {
+                                local->stbuf = *buf;
+                                /* Because st_blocks gets added again */
+                                local->stbuf.st_blocks = 0;
+                        }
+
+                        /* Always, pass the inode number of first child
+                           to the above layer */
+                        if (FIRST_CHILD(this) ==
+                            ((call_frame_t *)cookie)->this)
+                                local->stbuf.st_ino = buf->st_ino;
+
+                        local->stbuf.st_blocks += buf->st_blocks;
+                        if (local->stbuf.st_size < buf->st_size)
+                                local->stbuf.st_size = buf->st_size;
+                        if (local->stbuf.st_blksize != buf->st_blksize) {
+                                /* TODO: add to blocks in terms of
+                                   original block size */
+                        }
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (!callcnt) {
+                if (local->failed)
+                        local->op_ret = -1;
+
+                if ((local->op_ret != -1) && priv->xattr_supported) {
+                        /* Send a setxattr request to nodes where the
+                           files are created */
+                        int32_t        index = 0;
+                        int32_t        stripe_count = priv->child_count;
+                        char           size_key[256] = {0,};
+                        char           index_key[256] = {0,};
+                        char           count_key[256] = {0,};
+                        xlator_list_t *trav = this->children;
+                        dict_t        *dict = NULL;
+
+                        snprintf (size_key, sizeof (size_key),
+                                  "trusted.%s.stripe-size", this->name);
+                        snprintf (count_key, sizeof (count_key),
+                                  "trusted.%s.stripe-count", this->name);
+                        snprintf (index_key, sizeof (index_key),
+                                  "trusted.%s.stripe-index", this->name);
+
+                        local->call_count = priv->child_count;
+
+                        while (trav) {
+                                dict = get_new_dict ();
+                                dict_ref (dict);
+                                /* TODO: check return value */
+                                ret = dict_set_int64 (dict, size_key,
+                                                      local->stripe_size);
+                                ret = dict_set_int32 (dict, count_key,
+                                                      stripe_count);
+                                ret = dict_set_int32 (dict, index_key, index);
+
+                                STACK_WIND (frame,
+                                            stripe_mknod_ifreg_setxattr_cbk,
+                                            trav->xlator,
+                                            trav->xlator->fops->setxattr,
+                                            &local->loc, dict, 0);
+
+                                dict_unref (dict);
+                                index++;
+                                trav = trav->next;
+                        }
+                } else {
+                        /* Create itself has failed.. so return
+                           without setxattring */
+                        loc_wipe (&local->loc);
+                        STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                                      local->inode, &local->stbuf);
+                }
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_mknod - create a filesystem node.
+ *
+ * Returns EIO straight away when the first child is down.  A node that is
+ * not a regular file is created on the first child only.  A regular file is
+ * created on every child (older kernels send creat() as mknod() + open(),
+ * so S_IFREG has to be handled here); that path additionally refuses with
+ * EIO when any child is down, and its replies go to stripe_mknod_ifreg_cbk.
+ */
+int32_t
+stripe_mknod (call_frame_t *frame,
+              xlator_t *this,
+              loc_t *loc,
+              mode_t mode,
+              dev_t rdev)
+{
+        stripe_private_t *priv  = this->private;
+        stripe_local_t   *local = NULL;
+        xlator_list_t    *child = NULL;
+
+        if (priv->first_child_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "First node down, returning EIO");
+                STACK_UNWIND (frame, -1, EIO, NULL, NULL);
+                return 0;
+        }
+
+        if (!S_ISREG(mode)) {
+                /* Anything but a regular file goes to the first child. */
+                STACK_WIND (frame,
+                            stripe_common_inode_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->mknod,
+                            loc, mode, rdev);
+                return 0;
+        }
+
+        if (priv->nodes_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Some node down, returning EIO");
+                STACK_UNWIND (frame, -1, EIO, loc->inode, NULL);
+                return 0;
+        }
+
+        /* Per-call state shared by all the child callbacks. */
+        local = CALLOC (1, sizeof (*local));
+        ERR_ABORT (local);
+        local->op_ret      = -1;
+        local->op_errno    = ENOTCONN;
+        local->stripe_size = stripe_get_matching_bs (loc->path,
+                                                     priv->pattern,
+                                                     priv->block_size);
+        frame->local = local;
+        local->inode = loc->inode;
+        loc_copy (&local->loc, loc);
+
+        /* One reply is expected from each child. */
+        local->call_count = priv->child_count;
+
+        for (child = this->children; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_mknod_ifreg_cbk,
+                            child->xlator,
+                            child->xlator->fops->mknod,
+                            loc, mode, rdev);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_mkdir - create a directory on every child.
+ *
+ * Returns EIO when the first child is down.  Otherwise one mkdir is wound
+ * to each child with stripe_stack_unwind_inode_cbk as the callback, and
+ * local->call_count is primed with the number of children.
+ */
+int32_t
+stripe_mkdir (call_frame_t *frame,
+              xlator_t *this,
+              loc_t *loc,
+              mode_t mode)
+{
+        stripe_private_t *priv  = this->private;
+        stripe_local_t   *local = NULL;
+        xlator_list_t    *child = NULL;
+
+        if (priv->first_child_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "First node down, returning EIO");
+                STACK_UNWIND (frame, -1, EIO, NULL, NULL);
+                return 0;
+        }
+
+        local = CALLOC (1, sizeof (*local));
+        ERR_ABORT (local);
+        local->op_ret     = -1;
+        local->call_count = priv->child_count;
+        frame->local      = local;
+
+        for (child = this->children; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_inode_cbk,
+                            child->xlator,
+                            child->xlator->fops->mkdir,
+                            loc, mode);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_symlink - create a symbolic link.
+ *
+ * The request is sent to the first child only; when that child is down the
+ * call is answered with EIO instead.
+ */
+int32_t
+stripe_symlink (call_frame_t *frame,
+                xlator_t *this,
+                const char *linkpath,
+                loc_t *loc)
+{
+        stripe_private_t *priv  = this->private;
+        xlator_t         *first = NULL;
+
+        if (priv->first_child_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "First node down, returning EIO");
+                STACK_UNWIND (frame, -1, EIO, NULL, NULL);
+                return 0;
+        }
+
+        first = FIRST_CHILD(this);
+        STACK_WIND (frame,
+                    stripe_common_inode_cbk,
+                    first,
+                    first->fops->symlink,
+                    linkpath, loc);
+
+        return 0;
+}
+
+/**
+ * stripe_link - create a hard link.
+ *
+ * Returns EIO when the first child is down.  When oldloc's inode is not a
+ * regular file the link is made on the first entry of this->children only
+ * (callback stripe_common_inode_cbk).  For a regular file the link is made
+ * on every child and the replies go to stripe_stack_unwind_inode_cbk.
+ */
+int32_t
+stripe_link (call_frame_t *frame,
+             xlator_t *this,
+             loc_t *oldloc,
+             loc_t *newloc)
+{
+        stripe_private_t *priv  = this->private;
+        stripe_local_t   *local = NULL;
+        xlator_list_t    *child = this->children;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc);
+
+        if (priv->first_child_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "First node down, returning EIO");
+                STACK_UNWIND (frame, -1, EIO, NULL, NULL);
+                return 0;
+        }
+
+        if (!S_ISREG (oldloc->inode->st_mode)) {
+                /* Not a regular file: a single child is enough. */
+                STACK_WIND (frame,
+                            stripe_common_inode_cbk,
+                            child->xlator,
+                            child->xlator->fops->link,
+                            oldloc, newloc);
+                return 0;
+        }
+
+        /* Regular file: link it on every child. */
+        local = CALLOC (1, sizeof (*local));
+        ERR_ABORT (local);
+        local->op_ret     = -1;
+        frame->local      = local;
+        local->call_count = priv->child_count;
+
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_inode_cbk,
+                            child->xlator,
+                            child->xlator->fops->link,
+                            oldloc, newloc);
+        }
+
+        return 0;
+}
+
+/**
+ * stripe_create_fail_unlink_cbk - reply handler for the unlinks wound by
+ * stripe_create_setxattr_cbk.
+ *
+ * Counts down local->call_count under the frame lock.  The reply that
+ * brings it to zero wipes local->loc, unwinds with the op_ret/op_errno
+ * stored in local and then drops the fd reference held in local->fd.
+ * The op_ret/op_errno of the unlink itself are not looked at.
+ */
+int32_t
+stripe_create_fail_unlink_cbk (call_frame_t *frame,
+                               void *cookie,
+                               xlator_t *this,
+                               int32_t op_ret,
+                               int32_t op_errno)
+{
+        stripe_local_t *local   = frame->local;
+        fd_t           *lfd     = NULL;
+        int32_t         pending = 0;
+
+        LOCK (&frame->lock);
+        pending = --local->call_count;
+        UNLOCK (&frame->lock);
+
+        if (pending)
+                return 0;
+
+        /* Keep the fd pointer aside: it is unref'd after the unwind. */
+        lfd = local->fd;
+        loc_wipe (&local->loc);
+        STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                      local->fd, local->inode, &local->stbuf);
+        fd_unref (lfd);
+
+        return 0;
+}
+
+
+/**
+ * stripe_create_setxattr_cbk - reply handler for the setxattr calls wound
+ * by stripe_create_cbk.
+ *
+ * A failed reply is logged and recorded in local->op_ret/op_errno.  When
+ * the last reply is in: if any setxattr failed, an unlink of local->loc is
+ * wound to every child (handled by stripe_create_fail_unlink_cbk);
+ * otherwise the create is unwound and the fd reference in local is dropped.
+ */
+int32_t
+stripe_create_setxattr_cbk (call_frame_t *frame,
+                            void *cookie,
+                            xlator_t *this,
+                            int32_t op_ret,
+                            int32_t op_errno)
+{
+        stripe_local_t   *local   = frame->local;
+        stripe_private_t *priv    = this->private;
+        xlator_list_t    *child   = this->children;
+        fd_t             *lfd     = NULL;
+        int32_t           pending = 0;
+
+        LOCK (&frame->lock);
+        pending = --local->call_count;
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "%s returned error %s",
+                        ((call_frame_t *)cookie)->this->name,
+                        strerror (op_errno));
+                local->op_ret   = -1;
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        if (pending)
+                return 0;
+
+        if (local->op_ret == -1) {
+                /* Tagging failed somewhere: remove the file everywhere. */
+                local->call_count = priv->child_count;
+                for (; child; child = child->next) {
+                        STACK_WIND (frame,
+                                    stripe_create_fail_unlink_cbk,
+                                    child->xlator,
+                                    child->xlator->fops->unlink,
+                                    &local->loc);
+                }
+                return 0;
+        }
+
+        lfd = local->fd;
+        loc_wipe (&local->loc);
+        STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                      local->fd, local->inode, &local->stbuf);
+        fd_unref (lfd);
+
+        return 0;
+}
+
+/**
+ * stripe_create_cbk - reply handler for the create calls wound by
+ * stripe_create.
+ *
+ * Each reply is folded into local under the frame lock: a failure is logged
+ * and marks local->failed; a success seeds local->stbuf from the first
+ * reply, takes st_ino from the first child, sums st_blocks and keeps the
+ * largest st_size.
+ *
+ * After the last reply:
+ *  - on overall success the stripe size is stored in the fd context;
+ *  - if, in addition, a stripe size is set and xattrs are supported, a
+ *    setxattr carrying stripe-size / stripe-count / stripe-index is wound
+ *    to every child (replies go to stripe_create_setxattr_cbk);
+ *  - otherwise the create is unwound immediately and the fd reference in
+ *    local is dropped.
+ */
+int32_t
+stripe_create_cbk (call_frame_t *frame,
+                   void *cookie,
+                   xlator_t *this,
+                   int32_t op_ret,
+                   int32_t op_errno,
+                   fd_t *fd,
+                   inode_t *inode,
+                   struct stat *buf)
+{
+        int32_t           callcnt = 0;
+        stripe_local_t   *local   = frame->local;
+        stripe_private_t *priv    = this->private;
+        fd_t             *lfd     = NULL;
+
+        LOCK (&frame->lock);
+        {
+                callcnt = --local->call_count;
+
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s returned error %s",
+                                ((call_frame_t *)cookie)->this->name,
+                                strerror (op_errno));
+                        local->failed = 1;
+                        local->op_errno = op_errno;
+                }
+
+                if (op_ret >= 0) {
+                        local->op_ret = op_ret;
+                        /* First successful reply seeds the stat buffer */
+                        if (local->stbuf.st_blksize == 0) {
+                                local->stbuf = *buf;
+                                /* Because st_blocks gets added again */
+                                local->stbuf.st_blocks = 0;
+                        }
+
+                        /* Always, pass the inode number of first
+                           child to the above layer */
+                        if (FIRST_CHILD(this) ==
+                            ((call_frame_t *)cookie)->this)
+                                local->stbuf.st_ino = buf->st_ino;
+
+                        local->stbuf.st_blocks += buf->st_blocks;
+                        if (local->stbuf.st_size < buf->st_size)
+                                local->stbuf.st_size = buf->st_size;
+                        if (local->stbuf.st_blksize != buf->st_blksize) {
+                                /* TODO: add to blocks in terms of
+                                   original block size */
+                        }
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (!callcnt) {
+                if (local->failed)
+                        local->op_ret = -1;
+
+                if (local->op_ret >= 0) {
+                        fd_ctx_set (local->fd, this, local->stripe_size);
+                }
+
+                if ((local->op_ret != -1) &&
+                    local->stripe_size && priv->xattr_supported) {
+                        /* Send a setxattr request to nodes where
+                           the files are created */
+                        int            ret = 0;
+                        int32_t        index = 0;
+                        /* Snapshot the child count: local->call_count is
+                           decremented by setxattr replies that may arrive
+                           while this loop is still winding, so it must not
+                           be used as the stripe-count value. */
+                        int32_t        stripe_count = priv->child_count;
+                        char           size_key[256] = {0,};
+                        char           index_key[256] = {0,};
+                        char           count_key[256] = {0,};
+                        xlator_list_t *trav = this->children;
+                        dict_t        *dict = NULL;
+
+                        /* Bounded formatting: this->name is not length
+                           checked here. */
+                        snprintf (size_key, sizeof (size_key),
+                                  "trusted.%s.stripe-size", this->name);
+                        snprintf (count_key, sizeof (count_key),
+                                  "trusted.%s.stripe-count", this->name);
+                        snprintf (index_key, sizeof (index_key),
+                                  "trusted.%s.stripe-index", this->name);
+
+                        local->call_count = stripe_count;
+
+                        while (trav) {
+                                dict = get_new_dict ();
+                                dict_ref (dict);
+
+                                ret = dict_set_int64 (dict, size_key,
+                                                      local->stripe_size);
+                                ret |= dict_set_int32 (dict, count_key,
+                                                       stripe_count);
+                                ret |= dict_set_int32 (dict, index_key,
+                                                       index);
+                                if (ret) {
+                                        gf_log (this->name, GF_LOG_WARNING,
+                                                "failed to fill stripe "
+                                                "xattr dict for %s",
+                                                local->loc.path);
+                                }
+
+                                STACK_WIND (frame,
+                                            stripe_create_setxattr_cbk,
+                                            trav->xlator,
+                                            trav->xlator->fops->setxattr,
+                                            &local->loc,
+                                            dict,
+                                            0);
+
+                                dict_unref (dict);
+                                index++;
+                                trav = trav->next;
+                        }
+                } else {
+                        /* Create itself has failed.. so return
+                           without setxattring */
+                        lfd = local->fd;
+                        loc_wipe (&local->loc);
+                        STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                                      local->fd, local->inode, &local->stbuf);
+
+                        fd_unref (lfd);
+                }
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_create - create and open a file on every child.
+ *
+ * O_APPEND is stripped from @flags before the request is forwarded.  The
+ * call is refused with EIO when the first child or any other child is down.
+ * Otherwise the stripe size matching loc->path is looked up, a reference on
+ * @fd is taken into local->fd, and one create is wound to each child with
+ * stripe_create_cbk as the callback.
+ *
+ * @loc   - location of the file to be created.
+ * @flags - open flags (O_APPEND is removed).
+ * @mode  - permission bits for the new file.
+ * @fd    - file descriptor object to open.
+ */
+int32_t
+stripe_create (call_frame_t *frame,
+               xlator_t *this,
+               loc_t *loc,
+               int32_t flags,
+               mode_t mode,
+               fd_t *fd)
+{
+        stripe_private_t *priv = this->private;
+        stripe_local_t   *local = NULL;
+        xlator_list_t    *trav = NULL;
+
+        /* files created in O_APPEND mode does not allow lseek() on fd */
+        flags &= ~O_APPEND;
+
+        if (priv->first_child_down || priv->nodes_down) {
+                /* Report which condition actually fired, using the same
+                   wording as stripe_mknod. */
+                gf_log (this->name, GF_LOG_WARNING,
+                        "%s node down, returning EIO",
+                        priv->first_child_down ? "First" : "Some");
+                STACK_UNWIND (frame, -1, EIO, fd, loc->inode, NULL);
+                return 0;
+        }
+
+        /* Initialization */
+        local = CALLOC (1, sizeof (stripe_local_t));
+        ERR_ABORT (local);
+        local->op_ret = -1;
+        local->op_errno = ENOTCONN;
+        local->stripe_size = stripe_get_matching_bs (loc->path,
+                                                     priv->pattern,
+                                                     priv->block_size);
+        frame->local = local;
+        local->inode = loc->inode;
+        loc_copy (&local->loc, loc);
+        local->fd = fd_ref (fd);
+
+        /* One reply is expected from each child */
+        local->call_count = priv->child_count;
+
+        trav = this->children;
+        while (trav) {
+                STACK_WIND (frame,
+                            stripe_create_cbk,
+                            trav->xlator,
+                            trav->xlator->fops->create,
+                            loc, flags, mode, fd);
+                trav = trav->next;
+        }
+
+        return 0;
+}
+
+/**
+ * stripe_open_cbk - reply handler for the open calls wound by stripe_open
+ * and stripe_open_getxattr_cbk.
+ *
+ * A failed reply is logged, marks local->failed and stores the errno; a
+ * non-negative reply stores op_ret.  On the last reply any recorded failure
+ * forces op_ret to -1; on success the stripe size is stored in the fd
+ * context.  local->loc is wiped and the frame is unwound with the fd that
+ * came with this reply.
+ */
+int32_t
+stripe_open_cbk (call_frame_t *frame,
+                 void *cookie,
+                 xlator_t *this,
+                 int32_t op_ret,
+                 int32_t op_errno,
+                 fd_t *fd)
+{
+        stripe_local_t *local   = frame->local;
+        int32_t         pending = 0;
+
+        LOCK (&frame->lock);
+        pending = --local->call_count;
+        if (op_ret == -1) {
+                local->failed = 1;
+                gf_log (this->name, GF_LOG_WARNING,
+                        "%s returned error %s",
+                        ((call_frame_t *)cookie)->this->name,
+                        strerror (op_errno));
+                local->op_ret   = -1;
+                local->op_errno = op_errno;
+        } else if (op_ret >= 0) {
+                local->op_ret = op_ret;
+        }
+        UNLOCK (&frame->lock);
+
+        if (pending)
+                return 0;
+
+        if (local->failed)
+                local->op_ret = -1;
+
+        if (local->op_ret >= 0)
+                fd_ctx_set (local->fd, this, local->stripe_size);
+
+        loc_wipe (&local->loc);
+        STACK_UNWIND (frame, local->op_ret, local->op_errno, fd);
+
+        return 0;
+}
+
+
+/**
+ * stripe_open_getxattr_cbk - reply handler for the getxattr calls wound by
+ * stripe_open.
+ *
+ * A failed reply is logged and recorded in local->op_ret/op_errno; ENOTCONN
+ * additionally marks local->failed.  When the last reply is in and nothing
+ * failed, the "trusted.<name>.stripe-size" value of the dict delivered with
+ * that last reply overrides local->stripe_size; a missing key is logged as
+ * critical.  In every case an open is then wound to each child, with
+ * stripe_open_cbk as the callback.
+ */
+int32_t
+stripe_open_getxattr_cbk (call_frame_t *frame,
+                          void *cookie,
+                          xlator_t *this,
+                          int32_t op_ret,
+                          int32_t op_errno,
+                          dict_t *dict)
+{
+        int32_t           callcnt = 0;
+        stripe_local_t   *local = frame->local;
+        xlator_list_t    *trav = this->children;
+        stripe_private_t *priv = this->private;
+
+        LOCK (&frame->lock);
+        {
+                callcnt = --local->call_count;
+
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s returned error %s",
+                                ((call_frame_t *)cookie)->this->name,
+                                strerror (op_errno));
+                        local->op_ret = -1;
+                        local->op_errno = op_errno;
+                        if (op_errno == ENOTCONN)
+                                local->failed = 1;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (!callcnt) {
+                if (!local->failed && (local->op_ret != -1)) {
+                        /* No getxattr failed: pick up the stored size */
+                        char    size_key[256] = {0,};
+                        data_t *stripe_size_data = NULL;
+
+                        /* Bounded formatting: this->name is not length
+                           checked here. */
+                        snprintf (size_key, sizeof (size_key),
+                                  "trusted.%s.stripe-size", this->name);
+                        stripe_size_data = dict_get (dict, size_key);
+
+                        if (stripe_size_data) {
+                                local->stripe_size =
+                                        data_to_int64 (stripe_size_data);
+                        } else {
+                                /* if the file was created using earlier
+                                   versions of stripe */
+                                gf_log (this->name, GF_LOG_CRITICAL,
+                                        "[CRITICAL] Seems like file(%s) "
+                                        "created using earlier version",
+                                        local->loc.path);
+                        }
+                }
+
+                local->call_count = priv->child_count;
+
+                while (trav) {
+                        STACK_WIND (frame,
+                                    stripe_open_cbk,
+                                    trav->xlator,
+                                    trav->xlator->fops->open,
+                                    &local->loc, local->flags, local->fd);
+                        trav = trav->next;
+                }
+        }
+
+        return 0;
+}
+
+/**
+ * stripe_open - open a file on every child.
+ *
+ * O_APPEND is stripped from @flags.  Returns ENOTCONN when the first child
+ * is down.  The stripe size matching loc->path is computed up front.  With
+ * xattr support a getxattr is wound to every child first (callback
+ * stripe_open_getxattr_cbk, which then issues the opens); without it the
+ * opens are wound directly with stripe_open_cbk as the callback.
+ */
+int32_t
+stripe_open (call_frame_t *frame,
+             xlator_t *this,
+             loc_t *loc,
+             int32_t flags,
+             fd_t *fd)
+{
+        stripe_private_t *priv  = this->private;
+        xlator_list_t    *child = this->children;
+        stripe_local_t   *local = NULL;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        /* files opened in O_APPEND mode does not allow lseek() on fd */
+        flags &= ~O_APPEND;
+
+        if (priv->first_child_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "First node down, returning ENOTCONN");
+                STACK_UNWIND (frame, -1, ENOTCONN, NULL);
+                return 0;
+        }
+
+        local = CALLOC (1, sizeof (*local));
+        ERR_ABORT (local);
+        local->fd    = fd;
+        frame->local = local;
+        local->inode = loc->inode;
+        loc_copy (&local->loc, loc);
+
+        local->flags       = flags;
+        local->call_count  = priv->child_count;
+        local->stripe_size = stripe_get_matching_bs (loc->path,
+                                                     priv->pattern,
+                                                     priv->block_size);
+
+        if (!priv->xattr_supported) {
+                /* No xattrs to consult: open straight away. */
+                for (; child; child = child->next) {
+                        STACK_WIND (frame,
+                                    stripe_open_cbk,
+                                    child->xlator,
+                                    child->xlator->fops->open,
+                                    &local->loc, local->flags, local->fd);
+                }
+                return 0;
+        }
+
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_open_getxattr_cbk,
+                            child->xlator,
+                            child->xlator->fops->getxattr,
+                            loc, NULL);
+        }
+
+        return 0;
+}
+
+/**
+ * stripe_opendir_cbk - reply handler for the opendir calls wound by
+ * stripe_opendir.
+ *
+ * A failed reply is logged, marks local->failed and stores the errno; a
+ * non-negative reply stores op_ret.  When the last reply is in, a failure
+ * on any child makes the whole call fail (as stripe_open_cbk does), and the
+ * frame is unwound with local->fd.
+ */
+int32_t
+stripe_opendir_cbk (call_frame_t *frame,
+                    void *cookie,
+                    xlator_t *this,
+                    int32_t op_ret,
+                    int32_t op_errno,
+                    fd_t *fd)
+{
+        int32_t         callcnt = 0;
+        stripe_local_t *local = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                callcnt = --local->call_count;
+
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s returned error %s",
+                                ((call_frame_t *)cookie)->this->name,
+                                strerror (op_errno));
+                        local->op_ret = -1;
+                        local->failed = 1;
+                        local->op_errno = op_errno;
+                }
+
+                if (op_ret >= 0)
+                        local->op_ret = op_ret;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!callcnt) {
+                /* A success arriving after a failure overwrites op_ret
+                   above; the failed flag keeps the error sticky. */
+                if (local->failed)
+                        local->op_ret = -1;
+
+                STACK_UNWIND (frame, local->op_ret,
+                              local->op_errno, local->fd);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_opendir - open a directory on every child.
+ *
+ * Returns EIO when the first child is down; otherwise one opendir is wound
+ * to each child with stripe_opendir_cbk as the callback.
+ */
+int32_t
+stripe_opendir (call_frame_t *frame,
+                xlator_t *this,
+                loc_t *loc,
+                fd_t *fd)
+{
+        stripe_private_t *priv  = this->private;
+        xlator_list_t    *child = this->children;
+        stripe_local_t   *local = NULL;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        if (priv->first_child_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "First node down, returning EIO");
+                STACK_UNWIND (frame, -1, EIO, NULL);
+                return 0;
+        }
+
+        local = CALLOC (1, sizeof (*local));
+        ERR_ABORT (local);
+        frame->local      = local;
+        local->inode      = loc->inode;
+        local->fd         = fd;
+        local->call_count = priv->child_count;
+
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_opendir_cbk,
+                            child->xlator,
+                            child->xlator->fops->opendir,
+                            loc, fd);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_getxattr_cbk - reply handler for stripe_getxattr.
+ *
+ * Passes op_ret, op_errno and the returned dict to the caller unchanged.
+ */
+int32_t
+stripe_getxattr_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *value)
+{
+ STACK_UNWIND (frame, op_ret, op_errno, value);
+ return 0;
+}
+
+
+/**
+ * stripe_getxattr - read extended attributes.
+ *
+ * The request is forwarded to the first child only; the reply is relayed
+ * by stripe_getxattr_cbk.
+ */
+int32_t
+stripe_getxattr (call_frame_t *frame,
+                 xlator_t *this,
+                 loc_t *loc,
+                 const char *name)
+{
+        xlator_t *first = NULL;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        first = FIRST_CHILD(this);
+        STACK_WIND (frame,
+                    stripe_getxattr_cbk,
+                    first,
+                    first->fops->getxattr,
+                    loc, name);
+
+        return 0;
+}
+
+/**
+ * stripe_removexattr - remove an extended attribute.
+ *
+ * Returns ENOTCONN when the first child is down; otherwise the request is
+ * forwarded to the first child with stripe_common_cbk as the callback.
+ */
+int32_t
+stripe_removexattr (call_frame_t *frame,
+                    xlator_t *this,
+                    loc_t *loc,
+                    const char *name)
+{
+        stripe_private_t *priv  = this->private;
+        xlator_t         *first = NULL;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        if (priv->first_child_down) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "First node down, returning ENOTCONN");
+                STACK_UNWIND (frame, -1, ENOTCONN, NULL);
+                return 0;
+        }
+
+        first = FIRST_CHILD(this);
+        STACK_WIND (frame,
+                    stripe_common_cbk,
+                    first,
+                    first->fops->removexattr,
+                    loc, name);
+
+        return 0;
+}
+
+
+/**
+ * stripe_lk_cbk - reply handler for the lk calls wound by stripe_lk.
+ *
+ * A failed reply is logged and its errno stored; ENOTCONN also marks
+ * local->failed.  The first reply with op_ret == 0 sets local->op_ret to 0
+ * and copies *lock into local->lock.  When the last reply is in,
+ * local->failed forces op_ret to -1 and the frame is unwound with
+ * local->lock.
+ */
+int32_t
+stripe_lk_cbk (call_frame_t *frame,
+               void *cookie,
+               xlator_t *this,
+               int32_t op_ret,
+               int32_t op_errno,
+               struct flock *lock)
+{
+        stripe_local_t *local   = frame->local;
+        int32_t         pending = 0;
+
+        LOCK (&frame->lock);
+        pending = --local->call_count;
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "%s returned error %s",
+                        ((call_frame_t *)cookie)->this->name,
+                        strerror (op_errno));
+                local->op_errno = op_errno;
+                if (op_errno == ENOTCONN)
+                        local->failed = 1;
+        } else if (op_ret == 0 && local->op_ret == -1) {
+                /* First success: remember the lock it returned. */
+                local->op_ret = 0;
+                local->lock   = *lock;
+        }
+        UNLOCK (&frame->lock);
+
+        if (pending)
+                return 0;
+
+        if (local->failed)
+                local->op_ret = -1;
+        STACK_UNWIND (frame, local->op_ret,
+                      local->op_errno, &local->lock);
+
+        return 0;
+}
+
+
+/**
+ * stripe_lk - forward a lock request on @fd to every child.
+ *
+ * Replies are gathered by stripe_lk_cbk; local->op_ret starts at -1.
+ */
+int32_t
+stripe_lk (call_frame_t *frame,
+           xlator_t *this,
+           fd_t *fd,
+           int32_t cmd,
+           struct flock *lock)
+{
+        stripe_private_t *priv  = this->private;
+        xlator_list_t    *child = this->children;
+        stripe_local_t   *local = NULL;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
+
+        local = CALLOC (1, sizeof (*local));
+        ERR_ABORT (local);
+        local->op_ret     = -1;
+        frame->local      = local;
+        local->call_count = priv->child_count;
+
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_lk_cbk,
+                            child->xlator,
+                            child->xlator->fops->lk,
+                            fd, cmd, lock);
+        }
+
+        return 0;
+}
+
+/**
+ * stripe_setdents - forward a setdents request on @fd to every child.
+ *
+ * Replies are gathered by stripe_stack_unwind_cbk.
+ */
+int32_t
+stripe_setdents (call_frame_t *frame,
+                 xlator_t *this,
+                 fd_t *fd,
+                 int32_t flags,
+                 dir_entry_t *entries,
+                 int32_t count)
+{
+        stripe_private_t *priv  = this->private;
+        xlator_list_t    *child = this->children;
+        stripe_local_t   *local = NULL;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
+
+        local = CALLOC (1, sizeof (*local));
+        ERR_ABORT (local);
+        local->op_ret     = -1;
+        frame->local      = local;
+        local->call_count = priv->child_count;
+
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_cbk,
+                            child->xlator,
+                            child->xlator->fops->setdents,
+                            fd, flags, entries, count);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_flush - forward a flush on @fd to every child.
+ *
+ * Replies are gathered by stripe_stack_unwind_cbk.
+ */
+int32_t
+stripe_flush (call_frame_t *frame,
+              xlator_t *this,
+              fd_t *fd)
+{
+        stripe_private_t *priv  = this->private;
+        xlator_list_t    *child = this->children;
+        stripe_local_t   *local = NULL;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
+
+        local = CALLOC (1, sizeof (*local));
+        ERR_ABORT (local);
+        local->op_ret     = -1;
+        frame->local      = local;
+        local->call_count = priv->child_count;
+
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_cbk,
+                            child->xlator,
+                            child->xlator->fops->flush,
+                            fd);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_release - release hook for a file fd.
+ *
+ * Does nothing and returns 0.
+ */
+int32_t
+stripe_release (xlator_t *this,
+ fd_t *fd)
+{
+ return 0;
+}
+
+
+/**
+ * stripe_fsync - forward an fsync on @fd to every child.
+ *
+ * @flags is passed through untouched; replies are gathered by
+ * stripe_stack_unwind_cbk.
+ */
+int32_t
+stripe_fsync (call_frame_t *frame,
+              xlator_t *this,
+              fd_t *fd,
+              int32_t flags)
+{
+        stripe_private_t *priv  = this->private;
+        xlator_list_t    *child = this->children;
+        stripe_local_t   *local = NULL;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
+
+        local = CALLOC (1, sizeof (*local));
+        ERR_ABORT (local);
+        local->op_ret     = -1;
+        frame->local      = local;
+        local->call_count = priv->child_count;
+
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_cbk,
+                            child->xlator,
+                            child->xlator->fops->fsync,
+                            fd, flags);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_fstat - forward an fstat on @fd to every child.
+ *
+ * fd->inode is remembered in local; replies are gathered by
+ * stripe_stack_unwind_buf_cbk.
+ */
+int32_t
+stripe_fstat (call_frame_t *frame,
+              xlator_t *this,
+              fd_t *fd)
+{
+        stripe_private_t *priv  = this->private;
+        xlator_list_t    *child = this->children;
+        stripe_local_t   *local = NULL;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
+
+        local = CALLOC (1, sizeof (*local));
+        ERR_ABORT (local);
+        local->op_ret     = -1;
+        frame->local      = local;
+        local->inode      = fd->inode;
+        local->call_count = priv->child_count;
+
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_buf_cbk,
+                            child->xlator,
+                            child->xlator->fops->fstat,
+                            fd);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_fchmod - forward an fchmod on @fd to every child.
+ *
+ * fd->inode is remembered in local; replies are gathered by
+ * stripe_stack_unwind_buf_cbk.
+ */
+int32_t
+stripe_fchmod (call_frame_t *frame,
+               xlator_t *this,
+               fd_t *fd,
+               mode_t mode)
+{
+        stripe_private_t *priv  = this->private;
+        xlator_list_t    *child = this->children;
+        stripe_local_t   *local = NULL;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
+
+        local = CALLOC (1, sizeof (*local));
+        ERR_ABORT (local);
+        local->op_ret     = -1;
+        frame->local      = local;
+        local->inode      = fd->inode;
+        local->call_count = priv->child_count;
+
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_buf_cbk,
+                            child->xlator,
+                            child->xlator->fops->fchmod,
+                            fd, mode);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_fchown - forward an fchown on @fd to every child.
+ *
+ * fd->inode is remembered in local; replies are gathered by
+ * stripe_stack_unwind_buf_cbk.
+ */
+int32_t
+stripe_fchown (call_frame_t *frame,
+               xlator_t *this,
+               fd_t *fd,
+               uid_t uid,
+               gid_t gid)
+{
+        stripe_private_t *priv  = this->private;
+        xlator_list_t    *child = this->children;
+        stripe_local_t   *local = NULL;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
+
+        local = CALLOC (1, sizeof (*local));
+        ERR_ABORT (local);
+        local->op_ret     = -1;
+        frame->local      = local;
+        local->inode      = fd->inode;
+        local->call_count = priv->child_count;
+
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_buf_cbk,
+                            child->xlator,
+                            child->xlator->fops->fchown,
+                            fd, uid, gid);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_ftruncate - forward an ftruncate on @fd to every child.
+ *
+ * The same @offset is sent to each child; fd->inode is remembered in local
+ * and replies are gathered by stripe_stack_unwind_buf_cbk.
+ */
+int32_t
+stripe_ftruncate (call_frame_t *frame,
+                  xlator_t *this,
+                  fd_t *fd,
+                  off_t offset)
+{
+        stripe_private_t *priv  = this->private;
+        xlator_list_t    *child = this->children;
+        stripe_local_t   *local = NULL;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
+
+        local = CALLOC (1, sizeof (*local));
+        ERR_ABORT (local);
+        local->op_ret     = -1;
+        frame->local      = local;
+        local->inode      = fd->inode;
+        local->call_count = priv->child_count;
+
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_buf_cbk,
+                            child->xlator,
+                            child->xlator->fops->ftruncate,
+                            fd, offset);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_releasedir - release hook for a directory fd.
+ *
+ * Does nothing and returns 0.
+ */
+int32_t
+stripe_releasedir (xlator_t *this,
+ fd_t *fd)
+{
+ return 0;
+}
+
+
+/**
+ * stripe_fsyncdir - forward an fsyncdir on @fd to every child.
+ *
+ * @flags is passed through untouched; replies are gathered by
+ * stripe_stack_unwind_cbk.
+ */
+int32_t
+stripe_fsyncdir (call_frame_t *frame,
+                 xlator_t *this,
+                 fd_t *fd,
+                 int32_t flags)
+{
+        stripe_private_t *priv  = this->private;
+        xlator_list_t    *child = this->children;
+        stripe_local_t   *local = NULL;
+
+        STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
+
+        local = CALLOC (1, sizeof (*local));
+        ERR_ABORT (local);
+        local->op_ret     = -1;
+        frame->local      = local;
+        local->call_count = priv->child_count;
+
+        for (; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stack_unwind_cbk,
+                            child->xlator,
+                            child->xlator->fops->fsyncdir,
+                            fd,
+                            flags);
+        }
+
+        return 0;
+}
+
+
+/**
+ * stripe_single_readv_cbk - pass-through readv reply handler.
+ *
+ * Unwinds with exactly the op_ret, op_errno, vector, count and stbuf it
+ * received.  Per the original note it is meant as the return function when
+ * the file name doesn't match the pattern specified for striping; no caller
+ * is visible in this part of the file.
+ */
+int32_t
+stripe_single_readv_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iovec *vector,
+ int32_t count,
+ struct stat *stbuf)
+{
+ STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
+ return 0;
+}
+
+/**
+ * stripe_readv_cbk - collect one striped read reply; on the last one,
+ * stitch all replies together in stripe order and answer the original
+ * request.
+ *
+ * @frame is the per-stripe frame created by stripe_readv; its local holds
+ * the reply slot (node_index) and the original frame (orig_frame).  Each
+ * reply is stored in main_local->replies[] under the original frame's lock,
+ * with the iovec array duplicated by iov_dup.
+ *
+ * Once every stripe has replied:
+ *  - if any stripe failed, the original frame is unwound with -1 and that
+ *    stripe's errno;
+ *  - otherwise op_ret is the sum of the per-stripe byte counts, the vectors
+ *    are concatenated in index order and st_size is the largest reported.
+ * The duplicated vectors are released on both paths.  The per-stripe frame
+ * is always destroyed before returning.
+ */
+int32_t
+stripe_readv_cbk (call_frame_t *frame,
+                  void *cookie,
+                  xlator_t *this,
+                  int32_t op_ret,
+                  int32_t op_errno,
+                  struct iovec *vector,
+                  int32_t count,
+                  struct stat *stbuf)
+{
+        int32_t         index = 0;
+        int32_t         callcnt = 0;
+        int32_t         wind_count = 0;
+        call_frame_t   *main_frame = NULL;
+        stripe_local_t *main_local = NULL;
+        stripe_local_t *local = frame->local;
+
+        index = local->node_index;
+        main_frame = local->orig_frame;
+        main_local = main_frame->local;
+
+        LOCK (&main_frame->lock);
+        {
+                main_local->replies[index].op_ret = op_ret;
+                main_local->replies[index].op_errno = op_errno;
+                if (op_ret >= 0) {
+                        main_local->replies[index].stbuf = *stbuf;
+                        main_local->replies[index].count = count;
+                        main_local->replies[index].vector =
+                                iov_dup (vector, count);
+
+                        if (frame->root->rsp_refs)
+                                dict_copy (frame->root->rsp_refs,
+                                           main_frame->root->rsp_refs);
+                }
+                callcnt = ++main_local->call_count;
+                /* Read the target while still holding the lock: once it
+                   is released, the callback that completes the set may
+                   unwind the original frame. */
+                wind_count = main_local->wind_count;
+        }
+        UNLOCK(&main_frame->lock);
+
+        if (callcnt == wind_count) {
+                int32_t       final_count = 0;
+                struct iovec *final_vec = NULL;
+                struct stat   tmp_stbuf = {0,};
+                dict_t       *refs = NULL;
+
+                op_ret = 0;
+                memcpy (&tmp_stbuf, &main_local->replies[0].stbuf,
+                        sizeof (struct stat));
+                for (index = 0; index < wind_count; index++) {
+                        /* TODO: check whether each stripe returned 'expected'
+                         * number of bytes
+                         */
+                        if (main_local->replies[index].op_ret == -1) {
+                                op_ret = -1;
+                                op_errno = main_local->replies[index].op_errno;
+                                break;
+                        }
+                        op_ret += main_local->replies[index].op_ret;
+                        final_count += main_local->replies[index].count;
+                        /* TODO: Do I need to send anything more in stbuf? */
+                        if (tmp_stbuf.st_size <
+                            main_local->replies[index].stbuf.st_size) {
+                                tmp_stbuf.st_size =
+                                        main_local->replies[index].stbuf.st_size;
+                        }
+                }
+                if (op_ret != -1) {
+                        final_vec = CALLOC (final_count,
+                                            sizeof (struct iovec));
+                        ERR_ABORT (final_vec);
+                        final_count = 0;
+
+                        for (index = 0; index < wind_count; index++) {
+                                memcpy (final_vec + final_count,
+                                        main_local->replies[index].vector,
+                                        (main_local->replies[index].count *
+                                         sizeof (struct iovec)));
+                                final_count +=
+                                        main_local->replies[index].count;
+                        }
+                } else {
+                        final_vec = NULL;
+                        final_count = 0;
+                }
+
+                /* Release the duplicated vectors on the error path too;
+                   slots of failed stripes were never assigned and stay as
+                   allocated by stripe_readv. */
+                for (index = 0; index < wind_count; index++)
+                        free (main_local->replies[index].vector);
+
+                FREE (main_local->replies);
+                refs = main_frame->root->rsp_refs;
+                STACK_UNWIND (main_frame, op_ret, op_errno,
+                              final_vec, final_count, &tmp_stbuf);
+
+                dict_unref (refs);
+                if (final_vec)
+                        free (final_vec);
+        }
+
+        STACK_DESTROY (frame->root);
+        return 0;
+}
+
+/**
+ * stripe_readv - split a read into per-stripe reads.
+ *
+ * The stripe size is taken from the fd context; EINVAL is returned when it
+ * is zero.  The requested range [offset, offset + size) is cut at stripe
+ * boundaries and each piece is wound on its own copy of the frame, starting
+ * at the child selected by (offset / stripe_size) % child_count and moving
+ * round-robin through the children.  Replies are assembled by
+ * stripe_readv_cbk.
+ */
+int32_t
+stripe_readv (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ size_t size,
+ off_t offset)
+{
+ int32_t index = 0;
+ int32_t num_stripe = 0;
+ size_t frame_size = 0;
+ off_t rounded_end = 0;
+ uint64_t stripe_size = 0;
+ off_t rounded_start = 0;
+ off_t frame_offset = offset;
+ stripe_local_t *local = NULL;
+ call_frame_t *rframe = NULL;
+ stripe_local_t *rlocal = NULL;
+ xlator_list_t *trav = this->children;
+ stripe_private_t *priv = this->private;
+
+ /* stripe_size stays 0 if nothing was stored for this fd */
+ fd_ctx_get (fd, this, &stripe_size);
+ if (!stripe_size) {
+ STACK_UNWIND (frame, -1, EINVAL, NULL, 0, NULL);
+ return 0;
+ }
+
+ /* The file is stripe across the child nodes. Send the read request
+ * to the child nodes appropriately after checking which region of
+ * the file is in which child node. Always '0-<stripe_size>' part of
+ * the file resides in the first child.
+ */
+ /* floor/roof are defined elsewhere; presumably they round to a
+ * multiple of stripe_size -- TODO confirm */
+ rounded_start = floor (offset, stripe_size);
+ rounded_end = roof (offset+size, stripe_size);
+ num_stripe = (rounded_end - rounded_start) / stripe_size;
+ /* NOTE(review): if num_stripe comes out as 0 the loops below wind
+ * nothing and this function never unwinds frame -- confirm whether a
+ * zero-length read can reach this point */
+
+ local = CALLOC (1, sizeof (stripe_local_t));
+ ERR_ABORT (local);
+ local->wind_count = num_stripe;
+ frame->local = local;
+ /* Fresh dict on the original frame; stripe_readv_cbk copies each
+ * reply's rsp_refs into it */
+ frame->root->rsp_refs = dict_ref (get_new_dict ());
+
+ /* This is where all the vectors should be copied. */
+ local->replies = CALLOC (1, num_stripe *
+ sizeof (struct readv_replies));
+ ERR_ABORT (local->replies);
+
+ /* Advance trav to the child holding the stripe that contains offset */
+ for (index = 0;
+ index < ((offset / stripe_size) % priv->child_count);
+ index++) {
+ trav = trav->next;
+ }
+
+ for (index = 0; index < num_stripe; index++) {
+ /* Each piece gets its own frame and local */
+ rframe = copy_frame (frame);
+ rlocal = CALLOC (1, sizeof (stripe_local_t));
+ ERR_ABORT (rlocal);
+
+ /* Read up to the next stripe boundary or the end of the request,
+ * whichever comes first */
+ frame_size = min (roof (frame_offset+1, stripe_size),
+ (offset + size)) - frame_offset;
+
+ /* node_index is the slot in local->replies for this piece */
+ rlocal->node_index = index;
+ rlocal->orig_frame = frame;
+ rframe->local = rlocal;
+ STACK_WIND (rframe,
+ stripe_readv_cbk,
+ trav->xlator,
+ trav->xlator->fops->readv,
+ fd, frame_size, frame_offset);
+
+ frame_offset += frame_size;
+
+ /* Next child, wrapping back to the first one */
+ trav = trav->next ? trav->next : this->children;
+ }
+
+ return 0;
+}
+
+
+/**
+ * stripe_writev_cbk - reply handler for the per-stripe writes wound by
+ * stripe_writev.
+ *
+ * Successful replies add their byte count to local->op_ret and refresh
+ * local->stbuf.  A failed reply is logged and pins local->op_ret at -1 with
+ * its errno; byte counts from other stripes are no longer added after that,
+ * so the failure is what gets reported.  The frame is unwound once as many
+ * replies as winds have been seen and stripe_writev has marked the last
+ * chunk as sent (local->unwind).
+ */
+int32_t
+stripe_writev_cbk (call_frame_t *frame,
+                   void *cookie,
+                   xlator_t *this,
+                   int32_t op_ret,
+                   int32_t op_errno,
+                   struct stat *stbuf)
+{
+        int32_t         callcnt = 0;
+        int32_t         done = 0;
+        stripe_local_t *local = frame->local;
+
+        LOCK(&frame->lock);
+        {
+                callcnt = ++local->call_count;
+
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s returned error %s",
+                                ((call_frame_t *)cookie)->this->name,
+                                strerror (op_errno));
+                        local->op_errno = op_errno;
+                        local->op_ret = -1;
+                }
+                if (op_ret >= 0) {
+                        /* op_ret only ever accumulates non-negative
+                           counts, so -1 here means a stripe has already
+                           failed; adding to it would hide that failure. */
+                        if (local->op_ret != -1)
+                                local->op_ret += op_ret;
+                        local->stbuf = *stbuf;
+                }
+
+                /* Decide while holding the lock; after it is released the
+                   completing callback may unwind the frame. */
+                done = ((callcnt == local->wind_count) && local->unwind);
+        }
+        UNLOCK (&frame->lock);
+
+        if (done) {
+                STACK_UNWIND (frame, local->op_ret,
+                              local->op_errno, &local->stbuf);
+        }
+        return 0;
+}
+
+
+/**
+ * stripe_single_writev_cbk - pass-through writev reply handler.
+ *
+ * Unwinds with exactly the op_ret, op_errno and stbuf it received.
+ */
+int32_t
+stripe_single_writev_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct stat *stbuf)
+{
+ STACK_UNWIND (frame, op_ret, op_errno, stbuf);
+ return 0;
+}
+/**
+ * stripe_writev - split a vectored write into per-stripe writes.
+ *
+ * The stripe size is taken from the fd context; EINVAL is returned when it
+ * is zero.  The payload is walked from @offset in chunks that end at the
+ * next stripe boundary (or at the end of the data).  Each chunk is wound to
+ * the child selected by (position / stripe_size) % child_count, using the
+ * matching slice of @vector as produced by iov_subset.  Replies are counted
+ * by stripe_writev_cbk, which unwinds once local->unwind is set and all
+ * winds have replied.
+ */
+int32_t
+stripe_writev (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ struct iovec *vector,
+ int32_t count,
+ off_t offset)
+{
+ int32_t idx = 0;
+ int32_t total_size = 0;
+ int32_t offset_offset = 0;
+ int32_t remaining_size = 0;
+ int32_t tmp_count = count;
+ off_t fill_size = 0;
+ uint64_t stripe_size = 0;
+ struct iovec *tmp_vec = vector;
+ stripe_private_t *priv = this->private;
+ stripe_local_t *local = NULL;
+ xlator_list_t *trav = NULL;
+
+ /* stripe_size stays 0 if nothing was stored for this fd */
+ fd_ctx_get (fd, this, &stripe_size);
+ if (!stripe_size) {
+ STACK_UNWIND (frame, -1, EINVAL, NULL);
+ return 0;
+ }
+
+ /* File has to be stripped across the child nodes */
+ /* Total payload length; accumulated in an int32_t */
+ for (idx = 0; idx< count; idx ++) {
+ total_size += tmp_vec[idx].iov_len;
+ }
+ remaining_size = total_size;
+
+ local = CALLOC (1, sizeof (stripe_local_t));
+ ERR_ABORT (local);
+ frame->local = local;
+ local->stripe_size = stripe_size;
+
+ while (1) {
+ /* Send striped chunk of the vector to child
+ nodes appropriately. */
+ trav = this->children;
+
+ /* Index of the child owning the stripe at the current position */
+ idx = (((offset + offset_offset) /
+ local->stripe_size) % priv->child_count);
+ while (idx) {
+ trav = trav->next;
+ idx--;
+ }
+ /* Bytes left in the current stripe, capped by what remains */
+ fill_size = (local->stripe_size -
+ ((offset + offset_offset) % local->stripe_size));
+ if (fill_size > remaining_size)
+ fill_size = remaining_size;
+
+ remaining_size -= fill_size;
+
+ /* First call with NULL only sizes the slice, second call fills it
+ * -- presumably iov_subset returns the entry count; TODO confirm */
+ tmp_count = iov_subset (vector, count, offset_offset,
+ offset_offset + fill_size, NULL);
+ tmp_vec = CALLOC (tmp_count, sizeof (struct iovec));
+ ERR_ABORT (tmp_vec);
+ tmp_count = iov_subset (vector, count, offset_offset,
+ offset_offset + fill_size, tmp_vec);
+
+ /* Both fields are updated before the wind so the callback sees them;
+ * they are written here without taking frame->lock */
+ local->wind_count++;
+ if (remaining_size == 0)
+ local->unwind = 1;
+
+ STACK_WIND(frame,
+ stripe_writev_cbk,
+ trav->xlator,
+ trav->xlator->fops->writev,
+ fd, tmp_vec, tmp_count, offset + offset_offset);
+ FREE (tmp_vec);
+ offset_offset += fill_size;
+ /* local is not touched after the final wind */
+ if (remaining_size == 0)
+ break;
+ }
+
+ return 0;
+}
+
+
+
+/* Management operations */
+
+/**
+ * stripe_stats_cbk - collect the stats reply of one child.
+ *
+ * A failed reply (op_ret == -1) is logged and recorded in local. The first
+ * successful reply (local->op_ret still holds the -2 marker) is copied as
+ * is; later successful replies are added field by field. When the last
+ * pending reply has arrived the merged result is unwound to the caller.
+ */
+int32_t
+stripe_stats_cbk (call_frame_t *frame,
+                  void *cookie,
+                  xlator_t *this,
+                  int32_t op_ret,
+                  int32_t op_errno,
+                  struct xlator_stats *stats)
+{
+        stripe_local_t *local   = frame->local;
+        int32_t         pending = 0;
+
+        LOCK(&frame->lock);
+        {
+                pending = --local->call_count;
+
+                switch (op_ret) {
+                case -1:
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s returned error %s",
+                                ((call_frame_t *)cookie)->this->name,
+                                strerror (op_errno));
+                        local->op_ret = -1;
+                        local->op_errno = op_errno;
+                        break;
+                case 0:
+                        if (local->op_ret != -2) {
+                                local->stats.nr_files   += stats->nr_files;
+                                local->stats.free_disk  += stats->free_disk;
+                                local->stats.disk_usage += stats->disk_usage;
+                                local->stats.nr_clients += stats->nr_clients;
+                        } else {
+                                /* -2 marks "no successful reply yet" */
+                                local->stats = *stats;
+                                local->op_ret = 0;
+                        }
+                        break;
+                default:
+                        break;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (pending)
+                return 0;
+
+        STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                      &local->stats);
+        return 0;
+}
+
+/**
+ * stripe_stats - request statistics from every child.
+ *
+ * Allocates the frame-local state, seeds op_ret with -2 (the marker
+ * stripe_stats_cbk uses to recognise the first successful reply), sets the
+ * number of expected replies to the child count and winds the 'stats'
+ * management operation to each entry of this->children.
+ */
+int32_t
+stripe_stats (call_frame_t *frame,
+              xlator_t *this,
+              int32_t flags)
+{
+        stripe_private_t *priv  = this->private;
+        stripe_local_t   *local = NULL;
+        xlator_list_t    *child = NULL;
+
+        local = CALLOC (1, sizeof (stripe_local_t));
+        ERR_ABORT (local);
+        frame->local = local;
+        local->op_ret = -2; /* to be used as a flag in _cbk */
+        local->call_count = priv->child_count;
+
+        for (child = this->children; child; child = child->next) {
+                STACK_WIND (frame,
+                            stripe_stats_cbk,
+                            child->xlator,
+                            child->xlator->mops->stats,
+                            flags);
+        }
+        return 0;
+}
+
+/**
+ * stripe_set_child_state - record the up/down state of one child.
+ *
+ * @priv:  stripe private data.
+ * @child: xlator pointer delivered with the event.
+ * @state: 1 when the child came up, 0 when it went down.
+ *
+ * The state slot is only written when @child is found in priv->xl_array;
+ * an unknown pointer leaves the table untouched instead of writing one
+ * element past the end of it.
+ *
+ * Returns the number of children whose state is currently 0 (down).
+ */
+static int
+stripe_set_child_state (stripe_private_t *priv, void *child, int state)
+{
+        int i    = 0;
+        int down = 0;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (child == priv->xl_array[i]) {
+                        priv->state[i] = state;
+                        break;
+                }
+        }
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (!priv->state[i])
+                        down++;
+        }
+
+        return down;
+}
+
+/**
+ * notify - track child up/down events.
+ *
+ * @this:  stripe xlator.
+ * @event: GF_EVENT_* value.
+ * @data:  for CHILD_UP / CHILD_DOWN, the child xlator the event is about.
+ *
+ * For CHILD_UP and CHILD_DOWN the per-child state and priv->nodes_down are
+ * refreshed; the event is passed on through default_notify() only when it
+ * concerns the first child. Every other event is forwarded unchanged.
+ * Nothing is done while this->private is not set. Always returns 0.
+ */
+int32_t
+notify (xlator_t *this,
+        int32_t event,
+        void *data,
+        ...)
+{
+        stripe_private_t *priv        = this->private;
+        int               down_client = 0;
+        int               is_down     = 0;
+
+        if (!priv)
+                return 0;
+
+        switch (event)
+        {
+        case GF_EVENT_CHILD_UP:
+        case GF_EVENT_CHILD_DOWN:
+        {
+                is_down = (event == GF_EVENT_CHILD_DOWN);
+                down_client = stripe_set_child_state (priv, data, !is_down);
+
+                LOCK (&priv->lock);
+                {
+                        priv->nodes_down = down_client;
+
+                        if (data == FIRST_CHILD (this)) {
+                                priv->first_child_down = is_down;
+                                default_notify (this, event, data);
+                        }
+                }
+                UNLOCK (&priv->lock);
+        }
+        break;
+
+        default:
+        {
+                default_notify (this, event, data);
+        }
+        break;
+        }
+
+        return 0;
+}
+/**
+ * init - This function is called when xlator-graph gets initialized.
+ *        The options given in the volfile are parsed here.
+ * @this - stripe xlator being initialized.
+ *
+ * Recognised options:
+ *   block-size  comma separated list of "<pattern>:<size>" items, e.g.
+ *               "*avi:1GB,*pdf:4096". An item without ':' is tried as a
+ *               bare size; if that fails the item falls back to pattern
+ *               "*" with the 128KB default.
+ *   use-xattr   boolean stored in priv->xattr_supported (default 1).
+ *
+ * Returns 0 on success with this->private set, -1 on a configuration
+ * error. On failure everything allocated here is released again and
+ * this->private is left untouched.
+ */
+int32_t
+init (xlator_t *this)
+{
+        stripe_private_t      *priv     = NULL;
+        xlator_list_t         *trav     = NULL;
+        data_t                *data     = NULL;
+        struct stripe_options *opt_trav = NULL;
+        struct stripe_options *opt_prev = NULL;
+        int32_t                count    = 0;
+
+        trav = this->children;
+        while (trav) {
+                count++;
+                trav = trav->next;
+        }
+
+        if (!count) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "stripe configured without \"subvolumes\" option. "
+                        "exiting");
+                return -1;
+        }
+
+        if (!this->parents) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "dangling volume. check volfile ");
+        }
+
+        /* Checked before any allocation so this path has nothing to free */
+        if (count > 256) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "maximum number of stripe subvolumes supported "
+                        "is 256");
+                return -1;
+        }
+
+        priv = CALLOC (1, sizeof (stripe_private_t));
+        ERR_ABORT (priv);
+        priv->xl_array = CALLOC (count, sizeof (xlator_t *));
+        ERR_ABORT (priv->xl_array);
+        priv->child_count = count;
+        LOCK_INIT (&priv->lock);
+
+        trav = this->children;
+        count = 0;
+        while (trav) {
+                priv->xl_array[count++] = trav->xlator;
+                trav = trav->next;
+        }
+
+        priv->block_size = (128 * GF_UNIT_KB);
+        /* option block-size *avi:1GB,*pdf:4096 */
+        data = dict_get (this->options, "block-size");
+        if (!data) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "No \"option block-size <x>\" given, defaulting "
+                        "to 128KB");
+        } else {
+                char                  *tmp_str        = NULL;
+                char                  *tmp_str1       = NULL;
+                char                  *dup_str        = NULL;
+                char                  *stripe_str     = NULL;
+                char                  *pattern        = NULL;
+                char                  *num            = NULL;
+                size_t                 pattern_len    = 0;
+                struct stripe_options *temp_stripeopt = NULL;
+                struct stripe_options *stripe_opt     = NULL;
+
+                /* Get the pattern for striping.
+                   "option block-size *avi:10MB" etc */
+                stripe_str = strtok_r (data->data, ",", &tmp_str);
+                while (stripe_str) {
+                        /* Work on a copy: the inner strtok_r writes NULs */
+                        dup_str = strdup (stripe_str);
+                        ERR_ABORT (dup_str);
+                        stripe_opt = CALLOC (1,
+                                             sizeof (struct stripe_options));
+                        ERR_ABORT (stripe_opt);
+                        pattern = strtok_r (dup_str, ":", &tmp_str1);
+                        num = strtok_r (NULL, ":", &tmp_str1);
+
+                        if (!pattern) {
+                                /* item consisted of ':' characters only */
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "invalid block-size item \"%s\"",
+                                        stripe_str);
+                                FREE (stripe_opt);
+                                FREE (dup_str);
+                                goto err;
+                        }
+
+                        if (num &&
+                            (gf_string2bytesize (num,
+                                                 &stripe_opt->block_size)
+                             != 0)) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "invalid number format \"%s\"",
+                                        num);
+                                FREE (stripe_opt);
+                                FREE (dup_str);
+                                goto err;
+                        } else if (!num && (gf_string2bytesize (
+                                                    pattern,
+                                                    &stripe_opt->block_size)
+                                            != 0)) {
+                                /* Possible that there is no pattern given */
+                                stripe_opt->block_size = (128 * GF_UNIT_KB);
+                                pattern = "*";
+                        }
+
+                        /* Keep one byte for the terminating NUL that the
+                           zero-filled allocation already provides. */
+                        pattern_len = strlen (pattern);
+                        if (pattern_len >=
+                            sizeof (stripe_opt->path_pattern)) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "block-size pattern \"%s\" is "
+                                        "too long", pattern);
+                                FREE (stripe_opt);
+                                FREE (dup_str);
+                                goto err;
+                        }
+                        memcpy (stripe_opt->path_pattern,
+                                pattern, pattern_len);
+
+                        /* pattern/num are not used past this point */
+                        FREE (dup_str);
+
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "block-size : pattern %s : size %"PRId64,
+                                stripe_opt->path_pattern,
+                                stripe_opt->block_size);
+
+                        /* Append so patterns keep their volfile order */
+                        if (!priv->pattern) {
+                                priv->pattern = stripe_opt;
+                        } else {
+                                temp_stripeopt = priv->pattern;
+                                while (temp_stripeopt->next)
+                                        temp_stripeopt = temp_stripeopt->next;
+                                temp_stripeopt->next = stripe_opt;
+                        }
+                        stripe_str = strtok_r (NULL, ",", &tmp_str);
+                }
+        }
+
+        priv->xattr_supported = 1;
+        data = dict_get (this->options, "use-xattr");
+        if (data) {
+                if (gf_string2boolean (data->data,
+                                       &priv->xattr_supported) == -1) {
+                        /* Not fatal: initialization continues */
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "error setting hard check for extended "
+                                "attribute");
+                }
+        }
+
+        /* notify related */
+        priv->nodes_down = priv->child_count;
+        this->private = priv;
+
+        return 0;
+
+err:
+        /* Undo every allocation made above */
+        opt_trav = priv->pattern;
+        while (opt_trav) {
+                opt_prev = opt_trav;
+                opt_trav = opt_trav->next;
+                FREE (opt_prev);
+        }
+        FREE (priv->xl_array);
+        LOCK_DESTROY (&priv->lock);
+        FREE (priv);
+        return -1;
+}
+
+/**
+ * fini - Free all the private variables
+ * @this - stripe xlator being torn down.
+ *
+ * Releases the block-size pattern list, the child array, the lock and the
+ * private structure itself. Does nothing when this->private is not set,
+ * which is the state init() leaves behind when it fails.
+ */
+void
+fini (xlator_t *this)
+{
+        stripe_private_t      *priv = this->private;
+        struct stripe_options *prev = NULL;
+        struct stripe_options *trav = NULL;
+
+        if (!priv)
+                return;
+
+        trav = priv->pattern;
+        while (trav) {
+                prev = trav;
+                trav = trav->next;
+                FREE (prev);
+        }
+        FREE (priv->xl_array);
+        LOCK_DESTROY (&priv->lock);
+        FREE (priv);
+        /* Do not leave a dangling pointer behind */
+        this->private = NULL;
+        return;
+}
+
+
+/* File-operation table of this translator: every fop listed here is
+ * routed to its stripe_* implementation. */
+struct xlator_fops fops = {
+ .stat = stripe_stat,
+ .unlink = stripe_unlink,
+ .symlink = stripe_symlink,
+ .rename = stripe_rename,
+ .link = stripe_link,
+ .chmod = stripe_chmod,
+ .chown = stripe_chown,
+ .truncate = stripe_truncate,
+ .utimens = stripe_utimens,
+ .create = stripe_create,
+ .open = stripe_open,
+ .readv = stripe_readv,
+ .writev = stripe_writev,
+ .statfs = stripe_statfs,
+ .flush = stripe_flush,
+ .fsync = stripe_fsync,
+ .setxattr = stripe_setxattr,
+ .getxattr = stripe_getxattr,
+ .removexattr = stripe_removexattr,
+ .access = stripe_access,
+ .ftruncate = stripe_ftruncate,
+ .fstat = stripe_fstat,
+ .readlink = stripe_readlink,
+ .mkdir = stripe_mkdir,
+ .rmdir = stripe_rmdir,
+ .lk = stripe_lk,
+ .opendir = stripe_opendir,
+ .fsyncdir = stripe_fsyncdir,
+ .fchmod = stripe_fchmod,
+ .fchown = stripe_fchown,
+ .lookup = stripe_lookup,
+ .setdents = stripe_setdents,
+ .mknod = stripe_mknod,
+};
+
+/* Management-operation table: only 'stats' is provided, by stripe_stats. */
+struct xlator_mops mops = {
+ .stats = stripe_stats,
+};
+
+/* Callback table: release and releasedir are handled by stripe_release
+ * and stripe_releasedir. */
+struct xlator_cbks cbks = {
+ .release = stripe_release,
+ .releasedir = stripe_releasedir
+};
+
+
+/* Volfile options declared by this translator; both are read in init().
+ * The entry with a NULL key terminates the table. */
+struct volume_options options[] = {
+ /* "<pattern>:<size>" list, parsed by hand in init() */
+ { .key = {"block-size"},
+ .type = GF_OPTION_TYPE_ANY
+ },
+ /* boolean stored in priv->xattr_supported by init() */
+ { .key = {"use-xattr"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {NULL} },
+};
diff --git a/xlators/cluster/unify/Makefile.am b/xlators/cluster/unify/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/cluster/unify/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/cluster/unify/src/Makefile.am b/xlators/cluster/unify/src/Makefile.am
new file mode 100644
index 000000000..b9e6f63e9
--- /dev/null
+++ b/xlators/cluster/unify/src/Makefile.am
@@ -0,0 +1,16 @@
+
+xlator_LTLIBRARIES = unify.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+unify_la_LDFLAGS = -module -avoidversion
+
+unify_la_SOURCES = unify.c unify-self-heal.c
+unify_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = unify.h
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
+ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES =
+
diff --git a/xlators/cluster/unify/src/unify-self-heal.c b/xlators/cluster/unify/src/unify-self-heal.c
new file mode 100644
index 000000000..4885dd91a
--- /dev/null
+++ b/xlators/cluster/unify/src/unify-self-heal.c
@@ -0,0 +1,1225 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * unify-self-heal.c :
+ * This file implements a few functions which enable the 'unify' translator
+ * to be consistent in its behaviour when
+ * > a node fails,
+ * > a node gets added,
+ * > a failed node comes back,
+ * > a new namespace server is added (i.e., a fresh namespace server).
+ *
+ * This functionality of 'unify' enables glusterfs to support storage
+ * system failure and maintain consistency. This works both ways, i.e., when
+ * an entry (either file or directory) is found on the namespace server and
+ * not on the storage nodes, it is created on the storage nodes, and vice versa.
+ *
+ * The two fops where it can be implemented are 'getdents ()' and 'lookup ()'.
+ *
+ */
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "unify.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "common-utils.h"
+
+int32_t
+unify_sh_getdents_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dir_entry_t *entry,
+ int32_t count);
+
+int32_t
+unify_sh_ns_getdents_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dir_entry_t *entry,
+ int32_t count);
+
+int32_t
+unify_bgsh_getdents_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dir_entry_t *entry,
+ int32_t count);
+
+int32_t
+unify_bgsh_ns_getdents_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dir_entry_t *entry,
+ int32_t count);
+
+/**
+ * unify_local_wipe - free all the extra allocation of local->* here.
+ *
+ * Releases local->name, the offset/entry/count lists hanging off
+ * local->sh_struct and the sh_struct itself, then wipes loc1 and loc2.
+ * The 'local' structure itself is not freed by this function.
+ */
+static void
+unify_local_wipe (unify_local_t *local)
+{
+ /* Free the strdup'd variables in the local structure */
+ if (local->name) {
+ FREE (local->name);
+ }
+
+ if (local->sh_struct) {
+ /* Only the list arrays are released here, not what the
+ entry_list slots point to. */
+ if (local->sh_struct->offset_list)
+ FREE (local->sh_struct->offset_list);
+
+ if (local->sh_struct->entry_list)
+ FREE (local->sh_struct->entry_list);
+
+ if (local->sh_struct->count_list)
+ FREE (local->sh_struct->count_list);
+
+ FREE (local->sh_struct);
+ }
+
+ loc_wipe (&local->loc1);
+ loc_wipe (&local->loc2);
+}
+
+/**
+ * unify_sh_setdents_cbk - reply to a setdents sent to a storage node by
+ * unify_sh_ns_getdents_cbk.
+ *
+ * When the last outstanding reply arrives, the batch kept in
+ * entry_list[0] is freed. If that batch was not marked as the final one
+ * (local->flags unset) and it was full, the next batch is requested from
+ * the namespace. Otherwise the fd is released, local is wiped and the
+ * saved result is unwound.
+ */
+int32_t
+unify_sh_setdents_cbk (call_frame_t *frame,
+                       void *cookie,
+                       xlator_t *this,
+                       int32_t op_ret,
+                       int32_t op_errno)
+{
+        int32_t        callcnt  = -1;
+        unify_local_t *local    = frame->local;
+        inode_t       *inode    = NULL;
+        dict_t        *tmp_dict = NULL;
+        dir_entry_t   *entry    = NULL;
+        dir_entry_t   *trav     = NULL;
+
+        LOCK (&frame->lock);
+        {
+                /* if local->call_count == 0, that means, setdents on
+                 * storagenodes is still pending.
+                 */
+                if (local->call_count)
+                        callcnt = --local->call_count;
+        }
+        UNLOCK (&frame->lock);
+
+        if (callcnt != 0)
+                return 0;
+
+        /* Last reply: release the batch that was sent to the nodes */
+        entry = local->sh_struct->entry_list[0];
+        if (entry) {
+                trav = entry->next;
+                while (trav) {
+                        entry->next = trav->next;
+                        FREE (trav->name);
+                        if (S_ISLNK (trav->buf.st_mode)) {
+                                FREE (trav->link);
+                        }
+                        FREE (trav);
+                        trav = entry->next;
+                }
+                FREE (entry);
+                local->sh_struct->entry_list[0] = NULL;
+        }
+
+        if (!local->flags) {
+                if (local->sh_struct->count_list[0] >=
+                    UNIFY_SELF_HEAL_GETDENTS_COUNT) {
+                        /* count == size, that means, there are more
+                           entries to read from */
+                        local->sh_struct->offset_list[0] +=
+                                UNIFY_SELF_HEAL_GETDENTS_COUNT;
+                        STACK_WIND (frame,
+                                    unify_sh_ns_getdents_cbk,
+                                    NS(this),
+                                    NS(this)->fops->getdents,
+                                    local->fd,
+                                    UNIFY_SELF_HEAL_GETDENTS_COUNT,
+                                    local->sh_struct->offset_list[0],
+                                    GF_GET_DIR_ONLY);
+                }
+                return 0;
+        }
+
+        /* Final batch done: hand the result back to the caller */
+        inode = local->loc1.inode;
+        fd_unref (local->fd);
+        tmp_dict = local->dict;
+
+        unify_local_wipe (local);
+
+        STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                      inode, &local->stbuf, tmp_dict);
+        /* 'local' must not be touched after the unwind; use the copy */
+        if (tmp_dict)
+                dict_unref (tmp_dict);
+
+        return 0;
+}
+
+
+/**
+ * unify_sh_ns_getdents_cbk - reply to a getdents sent to the namespace.
+ *
+ * The returned entries are moved under a freshly allocated list head kept
+ * in entry_list[0] and sent with setdents (GF_SET_DIR_ONLY) to every
+ * storage node. A short batch, or no entry at all, marks this round as
+ * the final one through local->flags.
+ */
+int32_t
+unify_sh_ns_getdents_cbk (call_frame_t *frame,
+                          void *cookie,
+                          xlator_t *this,
+                          int32_t op_ret,
+                          int32_t op_errno,
+                          dir_entry_t *entry,
+                          int32_t count)
+{
+        unify_local_t   *local = frame->local;
+        unify_private_t *priv  = this->private;
+        long             index = 0;
+        unsigned long    final = 0;
+        dir_entry_t     *tmp   = NULL;
+
+        tmp = CALLOC (1, sizeof (dir_entry_t));
+        ERR_ABORT (tmp);
+
+        local->sh_struct->entry_list[0] = tmp;
+        local->sh_struct->count_list[0] = count;
+        if (entry) {
+                /* Take over the list; the caller keeps only its head */
+                tmp->next = entry->next;
+                entry->next = NULL;
+        }
+
+        if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) {
+                final = 1;
+        }
+
+        LOCK (&frame->lock);
+        {
+                /* local->call_count is '0' at this point. Expect one
+                   setdents reply from every storage node. */
+                local->call_count = priv->child_count;
+                if (final)
+                        local->flags = 1;
+        }
+        UNLOCK (&frame->lock);
+
+        for (index = 0; index < priv->child_count; index++)
+        {
+                STACK_WIND_COOKIE (frame,
+                                   unify_sh_setdents_cbk,
+                                   (void *)index,
+                                   priv->xl_array[index],
+                                   priv->xl_array[index]->fops->setdents,
+                                   local->fd, GF_SET_DIR_ONLY,
+                                   local->sh_struct->entry_list[0], count);
+        }
+
+        return 0;
+}
+
+/**
+ * unify_sh_ns_setdents_cbk - reply to the setdents sent to the namespace
+ * by unify_sh_getdents_cbk.
+ *
+ * @cookie: index (into priv->xl_array) of the storage node whose entries
+ *          were just sent to the namespace.
+ *
+ * Frees the batch kept for that node. If the batch was full the next one
+ * is requested from the same node; otherwise the node is finished and
+ * local->call_count is decremented. Once it reaches zero, getdents is
+ * wound to the namespace.
+ */
+int32_t
+unify_sh_ns_setdents_cbk (call_frame_t *frame,
+                          void *cookie,
+                          xlator_t *this,
+                          int32_t op_ret,
+                          int32_t op_errno)
+{
+        int32_t          callcnt = -1;
+        unify_local_t   *local   = frame->local;
+        unify_private_t *priv    = this->private;
+        long             index   = (long)cookie;
+        dir_entry_t     *prev    = NULL;
+        dir_entry_t     *entry   = NULL;
+        dir_entry_t     *trav    = NULL;
+
+        LOCK (&frame->lock);
+        {
+                if (local->sh_struct->entry_list[index]) {
+                        prev = entry = local->sh_struct->entry_list[index];
+                        trav = entry->next;
+                        while (trav) {
+                                prev->next = trav->next;
+                                FREE (trav->name);
+                                if (S_ISLNK (trav->buf.st_mode)) {
+                                        FREE (trav->link);
+                                }
+                                FREE (trav);
+                                trav = prev->next;
+                        }
+                        FREE (entry);
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (local->sh_struct->count_list[index] <
+            UNIFY_SELF_HEAL_GETDENTS_COUNT) {
+                LOCK (&frame->lock);
+                {
+                        callcnt = --local->call_count;
+                }
+                UNLOCK (&frame->lock);
+        } else {
+                /* count == size, that means, there are more entries
+                   to read from */
+                local->sh_struct->offset_list[index] +=
+                        UNIFY_SELF_HEAL_GETDENTS_COUNT;
+
+                /* Log before winding: the wind may complete the whole
+                   operation, after which 'local' is no longer ours. */
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "readdir on (%s) with offset %"PRId64"",
+                        priv->xl_array[index]->name,
+                        local->sh_struct->offset_list[index]);
+
+                STACK_WIND_COOKIE (frame,
+                                   unify_sh_getdents_cbk,
+                                   cookie,
+                                   priv->xl_array[index],
+                                   priv->xl_array[index]->fops->getdents,
+                                   local->fd,
+                                   UNIFY_SELF_HEAL_GETDENTS_COUNT,
+                                   local->sh_struct->offset_list[index],
+                                   GF_GET_ALL);
+        }
+
+        /* callcnt stays -1 unless this call finished a node */
+        if (!callcnt) {
+                /* All storage nodes have done unified setdents on NS node.
+                 * Now, do getdents from NS and do setdents on storage nodes.
+                 */
+
+                /* sh_struct->offset_list is no longer required for
+                   storage nodes now */
+                local->sh_struct->offset_list[0] = 0; /* reset */
+
+                STACK_WIND (frame,
+                            unify_sh_ns_getdents_cbk,
+                            NS(this),
+                            NS(this)->fops->getdents,
+                            local->fd,
+                            UNIFY_SELF_HEAL_GETDENTS_COUNT,
+                            0, /* In this call, do send '0' as offset */
+                            GF_GET_DIR_ONLY);
+        }
+
+        return 0;
+}
+
+
+/**
+ * unify_sh_getdents_cbk - reply to a getdents sent to one storage node.
+ *
+ * @cookie: index of that storage node in priv->xl_array.
+ *
+ * A non-empty successful reply is forwarded to the namespace with
+ * setdents (GF_SET_IF_NOT_PRESENT). Otherwise a short reply finishes the
+ * node and decrements local->call_count, while a full one requests the
+ * next batch. When every node is finished, getdents is wound to the
+ * namespace.
+ */
+int32_t
+unify_sh_getdents_cbk (call_frame_t *frame,
+                       void *cookie,
+                       xlator_t *this,
+                       int32_t op_ret,
+                       int32_t op_errno,
+                       dir_entry_t *entry,
+                       int32_t count)
+{
+        int32_t          callcnt = -1;
+        unify_local_t   *local   = frame->local;
+        unify_private_t *priv    = this->private;
+        long             index   = (long)cookie;
+        dir_entry_t     *tmp     = NULL;
+
+        if (op_ret >= 0 && count > 0) {
+                /* There is some dentry found, just send the dentry to NS */
+                tmp = CALLOC (1, sizeof (dir_entry_t));
+                ERR_ABORT (tmp);
+                local->sh_struct->entry_list[index] = tmp;
+                local->sh_struct->count_list[index] = count;
+                if (entry) {
+                        /* Take over the list; the caller keeps its head */
+                        tmp->next = entry->next;
+                        entry->next = NULL;
+                }
+                STACK_WIND_COOKIE (frame,
+                                   unify_sh_ns_setdents_cbk,
+                                   cookie,
+                                   NS(this),
+                                   NS(this)->fops->setdents,
+                                   local->fd,
+                                   GF_SET_IF_NOT_PRESENT,
+                                   local->sh_struct->entry_list[index],
+                                   count);
+                return 0;
+        }
+
+        if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) {
+                LOCK (&frame->lock);
+                {
+                        callcnt = --local->call_count;
+                }
+                UNLOCK (&frame->lock);
+        } else {
+                /* count == size, that means, there are more entries
+                   to read from */
+                local->sh_struct->offset_list[index] +=
+                        UNIFY_SELF_HEAL_GETDENTS_COUNT;
+
+                /* Log before winding: the wind may complete the whole
+                   operation, after which 'local' is no longer ours. */
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "readdir on (%s) with offset %"PRId64"",
+                        priv->xl_array[index]->name,
+                        local->sh_struct->offset_list[index]);
+
+                STACK_WIND_COOKIE (frame,
+                                   unify_sh_getdents_cbk,
+                                   cookie,
+                                   priv->xl_array[index],
+                                   priv->xl_array[index]->fops->getdents,
+                                   local->fd,
+                                   UNIFY_SELF_HEAL_GETDENTS_COUNT,
+                                   local->sh_struct->offset_list[index],
+                                   GF_GET_ALL);
+        }
+
+        /* callcnt stays -1 unless this call finished a node */
+        if (!callcnt) {
+                /* All storage nodes have done unified setdents on NS node.
+                 * Now, do getdents from NS and do setdents on storage nodes.
+                 */
+
+                /* sh_struct->offset_list is no longer required for
+                   storage nodes now */
+                local->sh_struct->offset_list[0] = 0; /* reset */
+
+                STACK_WIND (frame,
+                            unify_sh_ns_getdents_cbk,
+                            NS(this),
+                            NS(this)->fops->getdents,
+                            local->fd,
+                            UNIFY_SELF_HEAL_GETDENTS_COUNT,
+                            0, /* In this call, do send '0' as offset */
+                            GF_GET_DIR_ONLY);
+        }
+
+        return 0;
+}
+
+/**
+ * unify_sh_opendir_cbk - reply to one of the opendir calls wound by
+ * unify_sh_checksum_cbk.
+ *
+ * @cookie: not used in this function.
+ *
+ * Once the last reply is in: if no opendir failed, the per-node offset,
+ * entry and count lists are allocated and getdents is wound to every
+ * storage node. Otherwise self-heal is abandoned, and the lookup is
+ * unwound as successful (op_ret forced to 0).
+ */
+int32_t
+unify_sh_opendir_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ fd_t *fd)
+{
+ int32_t callcnt = 0;
+ unify_local_t *local = frame->local;
+ unify_private_t *priv = this->private;
+ int16_t index = 0;
+ inode_t *inode = NULL;
+ dict_t *tmp_dict = NULL;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (op_ret >= 0) {
+ local->op_ret = op_ret;
+ } else {
+ gf_log (this->name, GF_LOG_WARNING, "failed");
+ local->failed = 1;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ local->call_count = priv->child_count + 1;
+
+ if (!local->failed) {
+ /* send getdents() namespace after finishing
+ storage nodes */
+ /* call_count now equals priv->child_count */
+ local->call_count--;
+
+ fd_bind (fd);
+
+ if (local->call_count) {
+ /* Used as the offset index. This list keeps
+ * track of offset sent to each node during
+ * STACK_WIND.
+ */
+ local->sh_struct->offset_list =
+ calloc (priv->child_count,
+ sizeof (off_t));
+ ERR_ABORT (local->sh_struct->offset_list);
+
+ local->sh_struct->entry_list =
+ calloc (priv->child_count,
+ sizeof (dir_entry_t *));
+ ERR_ABORT (local->sh_struct->entry_list);
+
+ local->sh_struct->count_list =
+ calloc (priv->child_count,
+ sizeof (int));
+ ERR_ABORT (local->sh_struct->count_list);
+
+ /* Send getdents on all the fds */
+ /* The cookie carries the node index for the callback */
+ for (index = 0;
+ index < priv->child_count; index++) {
+ STACK_WIND_COOKIE (frame,
+ unify_sh_getdents_cbk,
+ (void *)(long)index,
+ priv->xl_array[index],
+ priv->xl_array[index]->fops->getdents,
+ local->fd,
+ UNIFY_SELF_HEAL_GETDENTS_COUNT,
+ 0, /* In this call, do send '0' as offset */
+ GF_GET_ALL);
+ }
+
+ /* did stack wind, so no need to unwind here */
+ return 0;
+ } /* (local->call_count) */
+ } /* (!local->failed) */
+
+ /* Opendir failed on one node. */
+ /* Save what the unwind needs before local is wiped */
+ inode = local->loc1.inode;
+ fd_unref (local->fd);
+ tmp_dict = local->dict;
+
+ unify_local_wipe (local);
+ /* Only 'self-heal' failed, lookup() was successful. */
+ local->op_ret = 0;
+
+ /* This is lookup_cbk ()'s UNWIND. */
+ STACK_UNWIND (frame, local->op_ret, local->op_errno, inode,
+ &local->stbuf, local->dict);
+ if (tmp_dict)
+ dict_unref (tmp_dict);
+ }
+
+ return 0;
+}
+
+/**
+ * unify_sh_checksum_cbk - reply to a checksum request from one node.
+ *
+ * @frame: frame whose local holds the self-heal state.
+ * @cookie: xlator that replied; compared against NS(this) to tell the
+ * namespace reply from the storage node replies.
+ * @this: pointer to unify xlator.
+ * @file_checksum: file checksum of the replying node (ZR_FILENAME_MAX bytes
+ * are used).
+ * @dir_checksum: directory checksum of the replying node (ZR_FILENAME_MAX
+ * bytes are used).
+ *
+ * Storage node file checksums are XOR-ed together and their directory
+ * checksums are required to be identical. After the last reply both are
+ * compared with the namespace values; a mismatch triggers the directory
+ * self-heal (opendir on every node), otherwise the lookup is unwound.
+ */
+int32_t
+unify_sh_checksum_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ uint8_t *file_checksum,
+ uint8_t *dir_checksum)
+{
+ unify_local_t *local = frame->local;
+ unify_private_t *priv = this->private;
+ int16_t index = 0;
+ int32_t callcnt = 0;
+ inode_t *inode = NULL;
+ dict_t *tmp_dict = NULL;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+ if (op_ret >= 0) {
+ if (NS(this) == (xlator_t *)cookie) {
+ /* Reply from the namespace: keep it for the final compare */
+ memcpy (local->sh_struct->ns_file_checksum,
+ file_checksum, ZR_FILENAME_MAX);
+ memcpy (local->sh_struct->ns_dir_checksum,
+ dir_checksum, ZR_FILENAME_MAX);
+ } else {
+ if (local->entry_count == 0) {
+ /* Initialize the dir_checksum to be
+ * used for comparision with other
+ * storage nodes. Should be done for
+ * the first successful call *only*.
+ */
+ /* Using 'entry_count' as a flag */
+ local->entry_count = 1;
+ memcpy (local->sh_struct->dir_checksum,
+ dir_checksum, ZR_FILENAME_MAX);
+ }
+
+ /* Reply from the storage nodes */
+ for (index = 0;
+ index < ZR_FILENAME_MAX; index++) {
+ /* Files should be present in
+ only one node */
+ local->sh_struct->file_checksum[index] ^= file_checksum[index];
+
+ /* directory structure should be
+ same accross */
+ if (local->sh_struct->dir_checksum[index] != dir_checksum[index])
+ local->failed = 1;
+ }
+ }
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ /* Last reply: compare the storage side with the namespace */
+ for (index = 0; index < ZR_FILENAME_MAX ; index++) {
+ if (local->sh_struct->file_checksum[index] !=
+ local->sh_struct->ns_file_checksum[index]) {
+ local->failed = 1;
+ break;
+ }
+ if (local->sh_struct->dir_checksum[index] !=
+ local->sh_struct->ns_dir_checksum[index]) {
+ local->failed = 1;
+ break;
+ }
+ }
+
+ if (local->failed) {
+ /* Log it, it should be a rare event */
+ gf_log (this->name, GF_LOG_WARNING,
+ "Self-heal triggered on directory %s",
+ local->loc1.path);
+
+ /* Any self heal will be done at directory level */
+ local->call_count = 0;
+ local->op_ret = -1;
+ local->failed = 0;
+
+ local->fd = fd_create (local->loc1.inode,
+ frame->root->pid);
+
+ local->call_count = priv->child_count + 1;
+
+ /* NOTE(review): the loop also uses xl_array[child_count];
+ presumably that slot is the namespace node - confirm
+ against the code that fills xl_array. */
+ for (index = 0;
+ index < (priv->child_count + 1); index++) {
+ STACK_WIND_COOKIE (frame,
+ unify_sh_opendir_cbk,
+ priv->xl_array[index]->name,
+ priv->xl_array[index],
+ priv->xl_array[index]->fops->opendir,
+ &local->loc1,
+ local->fd);
+ }
+ /* opendir can be done on the directory */
+ return 0;
+ }
+
+ /* no mismatch */
+ /* Save what the unwind needs before local is wiped */
+ inode = local->loc1.inode;
+ tmp_dict = local->dict;
+
+ unify_local_wipe (local);
+
+ /* This is lookup_cbk ()'s UNWIND. */
+ STACK_UNWIND (frame,
+ local->op_ret,
+ local->op_errno,
+ inode,
+ &local->stbuf,
+ local->dict);
+ if (tmp_dict)
+ dict_unref (tmp_dict);
+ }
+
+ return 0;
+}
+
+/* Foreground self-heal part over */
+
+/* Background self-heal part */
+
+/**
+ * unify_bgsh_setdents_cbk - background self-heal: reply to a setdents
+ * sent to a storage node by unify_bgsh_ns_getdents_cbk.
+ *
+ * When the last outstanding reply arrives, the batch kept in
+ * entry_list[0] is freed. If the batch was not the final one and it was
+ * full, the next batch is requested from the namespace; if it was the
+ * final one the fd is released, local is wiped and the call stack is
+ * destroyed (nothing is unwound).
+ */
+int32_t
+unify_bgsh_setdents_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno)
+{
+ /* stays -1 when call_count was already 0 */
+ int32_t callcnt = -1;
+ unify_local_t *local = frame->local;
+ dir_entry_t *prev, *entry, *trav;
+
+ LOCK (&frame->lock);
+ {
+ /* if local->call_count == 0, that means, setdents
+ on storagenodes is still pending. */
+ if (local->call_count)
+ callcnt = --local->call_count;
+ }
+ UNLOCK (&frame->lock);
+
+
+ if (callcnt == 0) {
+ if (local->sh_struct->entry_list[0]) {
+ /* Free every element behind the head, then the head */
+ prev = entry = local->sh_struct->entry_list[0];
+ trav = entry->next;
+ while (trav) {
+ prev->next = trav->next;
+ FREE (trav->name);
+ if (S_ISLNK (trav->buf.st_mode))
+ FREE (trav->link);
+ FREE (trav);
+ trav = prev->next;
+ }
+ FREE (entry);
+ }
+
+ if (!local->flags) {
+ if (local->sh_struct->count_list[0] >=
+ UNIFY_SELF_HEAL_GETDENTS_COUNT) {
+ /* count == size, that means, there are more
+ entries to read from */
+ //local->call_count = 0;
+ local->sh_struct->offset_list[0] +=
+ UNIFY_SELF_HEAL_GETDENTS_COUNT;
+ STACK_WIND (frame,
+ unify_bgsh_ns_getdents_cbk,
+ NS(this),
+ NS(this)->fops->getdents,
+ local->fd,
+ UNIFY_SELF_HEAL_GETDENTS_COUNT,
+ local->sh_struct->offset_list[0],
+ GF_GET_DIR_ONLY);
+ }
+ } else {
+ /* Final batch done: tear the background operation down */
+ fd_unref (local->fd);
+ unify_local_wipe (local);
+ STACK_DESTROY (frame->root);
+ }
+ }
+
+ return 0;
+}
+
+
+/**
+ * unify_bgsh_ns_getdents_cbk - background self-heal: reply to a getdents
+ * sent to the namespace.
+ *
+ * The returned entries are moved under a freshly allocated list head kept
+ * in entry_list[0] and sent with setdents (GF_SET_DIR_ONLY) to every
+ * storage node. A short batch, or no entry at all, marks this round as
+ * the final one through local->flags.
+ */
+int32_t
+unify_bgsh_ns_getdents_cbk (call_frame_t *frame,
+                            void *cookie,
+                            xlator_t *this,
+                            int32_t op_ret,
+                            int32_t op_errno,
+                            dir_entry_t *entry,
+                            int32_t count)
+{
+        unify_local_t   *local = frame->local;
+        unify_private_t *priv  = this->private;
+        long             index = 0;
+        unsigned long    final = 0;
+        dir_entry_t     *tmp   = NULL;
+
+        tmp = CALLOC (1, sizeof (dir_entry_t));
+        ERR_ABORT (tmp);
+
+        local->sh_struct->entry_list[0] = tmp;
+        local->sh_struct->count_list[0] = count;
+        if (entry) {
+                /* Take over the list; the caller keeps only its head */
+                tmp->next = entry->next;
+                entry->next = NULL;
+        }
+
+        if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) {
+                final = 1;
+        }
+
+        LOCK (&frame->lock);
+        {
+                /* local->call_count is '0' at this point. Expect one
+                   setdents reply from every storage node. */
+                local->call_count = priv->child_count;
+                if (final)
+                        local->flags = 1;
+        }
+        UNLOCK (&frame->lock);
+
+        for (index = 0; index < priv->child_count; index++)
+        {
+                STACK_WIND_COOKIE (frame,
+                                   unify_bgsh_setdents_cbk,
+                                   (void *)index,
+                                   priv->xl_array[index],
+                                   priv->xl_array[index]->fops->setdents,
+                                   local->fd, GF_SET_DIR_ONLY,
+                                   local->sh_struct->entry_list[0], count);
+        }
+
+        return 0;
+}
+
+/**
+ * unify_bgsh_ns_setdents_cbk - background self-heal: reply to the
+ * setdents sent to the namespace by unify_bgsh_getdents_cbk.
+ *
+ * @cookie: index (into priv->xl_array) of the storage node whose entries
+ *          were just sent to the namespace.
+ *
+ * Frees the batch kept for that node. If the batch was full the next one
+ * is requested from the same node; otherwise the node is finished and
+ * local->call_count is decremented. Once it reaches zero, getdents is
+ * wound to the namespace.
+ */
+int32_t
+unify_bgsh_ns_setdents_cbk (call_frame_t *frame,
+                            void *cookie,
+                            xlator_t *this,
+                            int32_t op_ret,
+                            int32_t op_errno)
+{
+        int32_t          callcnt = -1;
+        unify_local_t   *local   = frame->local;
+        unify_private_t *priv    = this->private;
+        long             index   = (long)cookie;
+        dir_entry_t     *prev    = NULL;
+        dir_entry_t     *entry   = NULL;
+        dir_entry_t     *trav    = NULL;
+
+        if (local->sh_struct->entry_list[index]) {
+                prev = entry = local->sh_struct->entry_list[index];
+                trav = entry->next;
+                while (trav) {
+                        prev->next = trav->next;
+                        FREE (trav->name);
+                        if (S_ISLNK (trav->buf.st_mode)) {
+                                FREE (trav->link);
+                        }
+                        FREE (trav);
+                        trav = prev->next;
+                }
+                FREE (entry);
+        }
+
+        if (local->sh_struct->count_list[index] <
+            UNIFY_SELF_HEAL_GETDENTS_COUNT) {
+                LOCK (&frame->lock);
+                {
+                        callcnt = --local->call_count;
+                }
+                UNLOCK (&frame->lock);
+        } else {
+                /* count == size, that means, there are more entries
+                   to read from */
+                local->sh_struct->offset_list[index] +=
+                        UNIFY_SELF_HEAL_GETDENTS_COUNT;
+
+                /* Log before winding: the wind may complete the whole
+                   operation, after which 'local' is no longer ours. */
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "readdir on (%s) with offset %"PRId64"",
+                        priv->xl_array[index]->name,
+                        local->sh_struct->offset_list[index]);
+
+                STACK_WIND_COOKIE (frame,
+                                   unify_bgsh_getdents_cbk,
+                                   cookie,
+                                   priv->xl_array[index],
+                                   priv->xl_array[index]->fops->getdents,
+                                   local->fd,
+                                   UNIFY_SELF_HEAL_GETDENTS_COUNT,
+                                   local->sh_struct->offset_list[index],
+                                   GF_GET_ALL);
+        }
+
+        /* callcnt stays -1 unless this call finished a node */
+        if (!callcnt) {
+                /* All storage nodes have done unified setdents on NS node.
+                 * Now, do getdents from NS and do setdents on storage nodes.
+                 */
+
+                /* sh_struct->offset_list is no longer required for
+                   storage nodes now */
+                local->sh_struct->offset_list[0] = 0; /* reset */
+
+                STACK_WIND (frame,
+                            unify_bgsh_ns_getdents_cbk,
+                            NS(this),
+                            NS(this)->fops->getdents,
+                            local->fd,
+                            UNIFY_SELF_HEAL_GETDENTS_COUNT,
+                            0, /* In this call, do send '0' as offset */
+                            GF_GET_DIR_ONLY);
+        }
+
+        return 0;
+}
+
+
+/**
+ * unify_bgsh_getdents_cbk - background self-heal: reply to a getdents
+ * sent to one storage node.
+ *
+ * @cookie: index of that storage node in priv->xl_array.
+ *
+ * A non-empty successful reply is forwarded to the namespace with
+ * setdents (GF_SET_IF_NOT_PRESENT). Otherwise a short reply finishes the
+ * node and decrements local->call_count, while a full one requests the
+ * next batch. When every node is finished, getdents is wound to the
+ * namespace.
+ */
+int32_t
+unify_bgsh_getdents_cbk (call_frame_t *frame,
+                         void *cookie,
+                         xlator_t *this,
+                         int32_t op_ret,
+                         int32_t op_errno,
+                         dir_entry_t *entry,
+                         int32_t count)
+{
+        int32_t          callcnt = -1;
+        unify_local_t   *local   = frame->local;
+        unify_private_t *priv    = this->private;
+        long             index   = (long)cookie;
+        dir_entry_t     *tmp     = NULL;
+
+        if (op_ret >= 0 && count > 0) {
+                /* There is some dentry found, just send the dentry to NS */
+                tmp = CALLOC (1, sizeof (dir_entry_t));
+                ERR_ABORT (tmp);
+                local->sh_struct->entry_list[index] = tmp;
+                local->sh_struct->count_list[index] = count;
+                if (entry) {
+                        /* Take over the list; the caller keeps its head */
+                        tmp->next = entry->next;
+                        entry->next = NULL;
+                }
+                STACK_WIND_COOKIE (frame,
+                                   unify_bgsh_ns_setdents_cbk,
+                                   cookie,
+                                   NS(this),
+                                   NS(this)->fops->setdents,
+                                   local->fd,
+                                   GF_SET_IF_NOT_PRESENT,
+                                   local->sh_struct->entry_list[index],
+                                   count);
+                return 0;
+        }
+
+        if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) {
+                LOCK (&frame->lock);
+                {
+                        callcnt = --local->call_count;
+                }
+                UNLOCK (&frame->lock);
+        } else {
+                /* count == size, that means, there are more entries
+                   to read from */
+                local->sh_struct->offset_list[index] +=
+                        UNIFY_SELF_HEAL_GETDENTS_COUNT;
+
+                /* Log before winding: the wind may complete the whole
+                   operation, after which 'local' is no longer ours. */
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "readdir on (%s) with offset %"PRId64"",
+                        priv->xl_array[index]->name,
+                        local->sh_struct->offset_list[index]);
+
+                STACK_WIND_COOKIE (frame,
+                                   unify_bgsh_getdents_cbk,
+                                   cookie,
+                                   priv->xl_array[index],
+                                   priv->xl_array[index]->fops->getdents,
+                                   local->fd,
+                                   UNIFY_SELF_HEAL_GETDENTS_COUNT,
+                                   local->sh_struct->offset_list[index],
+                                   GF_GET_ALL);
+        }
+
+        /* callcnt stays -1 unless this call finished a node */
+        if (!callcnt) {
+                /* All storage nodes have done unified setdents on NS node.
+                 * Now, do getdents from NS and do setdents on storage nodes.
+                 */
+
+                /* sh_struct->offset_list is no longer required for
+                   storage nodes now */
+                local->sh_struct->offset_list[0] = 0; /* reset */
+
+                STACK_WIND (frame,
+                            unify_bgsh_ns_getdents_cbk,
+                            NS(this),
+                            NS(this)->fops->getdents,
+                            local->fd,
+                            UNIFY_SELF_HEAL_GETDENTS_COUNT,
+                            0, /* In this call, do send '0' as offset */
+                            GF_GET_DIR_ONLY);
+        }
+
+        return 0;
+}
+
+/**
+ * unify_bgsh_opendir_cbk - background self-heal: reply to one opendir.
+ *
+ * @cookie: not used in this function.
+ *
+ * Once the last reply is in: if no opendir failed, the per-node offset,
+ * entry and count lists are allocated and getdents is wound to every
+ * storage node. Otherwise the fd is released, local is wiped and the call
+ * stack is destroyed (nothing is unwound).
+ */
+int32_t
+unify_bgsh_opendir_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ fd_t *fd)
+{
+ unify_local_t *local = frame->local;
+ unify_private_t *priv = this->private;
+ int32_t callcnt = 0;
+ int16_t index = 0;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (op_ret >= 0) {
+ local->op_ret = op_ret;
+ } else {
+ local->failed = 1;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ local->call_count = priv->child_count + 1;
+
+ if (!local->failed) {
+ /* send getdents() namespace after finishing
+ storage nodes */
+ /* call_count now equals priv->child_count */
+ local->call_count--;
+ callcnt = local->call_count;
+
+ fd_bind (fd);
+
+ if (local->call_count) {
+ /* Used as the offset index. This list keeps
+ track of offset sent to each node during
+ STACK_WIND. */
+ local->sh_struct->offset_list =
+ calloc (priv->child_count,
+ sizeof (off_t));
+ ERR_ABORT (local->sh_struct->offset_list);
+
+ local->sh_struct->entry_list =
+ calloc (priv->child_count,
+ sizeof (dir_entry_t *));
+ ERR_ABORT (local->sh_struct->entry_list);
+
+ local->sh_struct->count_list =
+ calloc (priv->child_count,
+ sizeof (int));
+ ERR_ABORT (local->sh_struct->count_list);
+
+ /* Send getdents on all the fds */
+ /* The cookie carries the node index for the callback */
+ for (index = 0;
+ index < priv->child_count; index++) {
+ STACK_WIND_COOKIE (frame,
+ unify_bgsh_getdents_cbk,
+ (void *)(long)index,
+ priv->xl_array[index],
+ priv->xl_array[index]->fops->getdents,
+ local->fd,
+ UNIFY_SELF_HEAL_GETDENTS_COUNT,
+ 0, /* In this call, do send '0' as offset */
+ GF_GET_ALL);
+ }
+ /* did a stack wind, so no need to unwind here */
+ return 0;
+ } /* (local->call_count) */
+ } /* (!local->failed) */
+
+ /* Opendir failed on one node. */
+ fd_unref (local->fd);
+
+ /* Background operation: tear the stack down instead of unwinding */
+ unify_local_wipe (local);
+ STACK_DESTROY (frame->root);
+ }
+
+ return 0;
+}
+
+/**
+ * unify_bgsh_checksum_cbk - checksum() reply handler of the background
+ * self-heal.
+ *
+ * Collects the checksums returned by the namespace and by the storage
+ * nodes. Once the last reply has arrived the collected values are compared;
+ * on any mismatch (or any difference seen while collecting) opendir() is
+ * wound to xl_array[0 .. child_count] to start the directory self-heal,
+ * otherwise local is wiped and the stack is destroyed.
+ *
+ * @frame: call frame; frame->local holds the unify_local_t of this heal.
+ * @cookie: xlator the reply came from; compared against NS(this).
+ * @this: pointer to unify xlator.
+ * @op_ret: >= 0 on success; other replies contribute no checksum.
+ * @op_errno: not read by this function.
+ * @file_checksum: file-name checksum of the replying node
+ * (ZR_FILENAME_MAX bytes are read).
+ * @dir_checksum: directory-name checksum of the replying node
+ * (ZR_FILENAME_MAX bytes are read).
+ *
+ * Always returns 0.
+ */
+int32_t
+unify_bgsh_checksum_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ uint8_t *file_checksum,
+ uint8_t *dir_checksum)
+{
+ unify_local_t *local = frame->local;
+ unify_private_t *priv = this->private;
+ int16_t index = 0;
+ int32_t callcnt = 0;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+ if (op_ret >= 0) {
+ if (NS(this) == (xlator_t *)cookie) {
+ /* Reply from the namespace: keep both checksums
+ as the reference values */
+ memcpy (local->sh_struct->ns_file_checksum,
+ file_checksum, ZR_FILENAME_MAX);
+ memcpy (local->sh_struct->ns_dir_checksum,
+ dir_checksum, ZR_FILENAME_MAX);
+ } else {
+ if (local->entry_count == 0) {
+ /* Initialize the dir_checksum to be
+ * used for comparison with other
+ * storage nodes. Should be done for
+ * the first successful call *only*.
+ */
+ /* Using 'entry_count' as a flag */
+ local->entry_count = 1;
+ memcpy (local->sh_struct->dir_checksum,
+ dir_checksum, ZR_FILENAME_MAX);
+ }
+
+ /* Reply from the storage nodes */
+ for (index = 0;
+ index < ZR_FILENAME_MAX; index++) {
+ /* Files should be present in only
+ one node */
+ /* so the per-node file checksums are
+ XOR-ed together and compared with
+ the namespace value further down */
+ local->sh_struct->file_checksum[index] ^= file_checksum[index];
+
+ /* directory structure should be same
+ across */
+ if (local->sh_struct->dir_checksum[index] != dir_checksum[index])
+ local->failed = 1;
+ }
+ }
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ /* only the reply that brings the counter to zero continues */
+ if (!callcnt) {
+ /* compare the combined storage-node checksums with the ones
+ reported by the namespace */
+ for (index = 0; index < ZR_FILENAME_MAX ; index++) {
+ if (local->sh_struct->file_checksum[index] !=
+ local->sh_struct->ns_file_checksum[index]) {
+ local->failed = 1;
+ break;
+ }
+ if (local->sh_struct->dir_checksum[index] !=
+ local->sh_struct->ns_dir_checksum[index]) {
+ local->failed = 1;
+ break;
+ }
+ }
+
+ if (local->failed) {
+ /* Log it, it should be a rare event */
+ gf_log (this->name, GF_LOG_WARNING,
+ "Self-heal triggered on directory %s",
+ local->loc1.path);
+
+ /* Any self heal will be done at the directory level */
+ local->op_ret = -1;
+ local->failed = 0;
+
+ /* NOTE(review): the result of fd_create() is not
+ checked before it is passed to opendir() */
+ local->fd = fd_create (local->loc1.inode,
+ frame->root->pid);
+ local->call_count = priv->child_count + 1;
+
+ /* child_count + 1 requests: index child_count is
+ included in the loop as well */
+ for (index = 0;
+ index < (priv->child_count + 1); index++) {
+ STACK_WIND_COOKIE (frame,
+ unify_bgsh_opendir_cbk,
+ priv->xl_array[index]->name,
+ priv->xl_array[index],
+ priv->xl_array[index]->fops->opendir,
+ &local->loc1,
+ local->fd);
+ }
+
+ /* opendir can be done on the directory */
+ return 0;
+ }
+
+ /* no mismatch */
+ unify_local_wipe (local);
+ STACK_DESTROY (frame->root);
+ }
+
+ return 0;
+}
+
+/* Background self-heal part over */
+
+
+
+
+/**
+ * zr_unify_self_heal - start a directory consistency check when one is due,
+ * and complete the pending lookup.
+ *
+ * When the generation stored in @local is older than the xlator's current
+ * generation, the inode ctx is refreshed and checksum() is wound to
+ * xl_array[0 .. child_count]:
+ *  - foreground mode (priv->self_heal == ZR_UNIFY_FG_SELF_HEAL): the
+ *    requests use @frame itself with unify_sh_checksum_cbk and this function
+ *    returns without unwinding;
+ *  - otherwise: the requests use a copy of @frame with
+ *    unify_bgsh_checksum_cbk, and the lookup is unwound right away.
+ * When the generation is current, the lookup is simply unwound.
+ *
+ * @frame: frame used in lookup.
+ * @this: pointer to unify xlator.
+ * @local: lookup state (frame->local of @frame); supplies the values that
+ *         are unwound and the reference on local->dict that is dropped here.
+ *
+ * Always returns 0.
+ */
+int32_t
+zr_unify_self_heal (call_frame_t *frame,
+                    xlator_t *this,
+                    unify_local_t *local)
+{
+        unify_private_t *priv = this->private;
+        call_frame_t    *bg_frame = NULL;
+        unify_local_t   *bg_local = NULL;
+        inode_t         *tmp_inode = NULL;
+        dict_t          *tmp_dict = NULL;
+        int16_t          index = 0;
+
+        if (local->inode_generation < priv->inode_generation) {
+                /* Any self heal will be done at the directory level */
+                /* Update the inode's generation to the current generation
+                   value. */
+                local->inode_generation = priv->inode_generation;
+                inode_ctx_put (local->loc1.inode, this,
+                               (uint64_t)(long)local->inode_generation);
+
+                if (priv->self_heal == ZR_UNIFY_FG_SELF_HEAL) {
+                        local->op_ret = 0;
+                        local->failed = 0;
+                        local->call_count = priv->child_count + 1;
+                        local->sh_struct =
+                                calloc (1, sizeof (struct unify_self_heal_struct));
+                        /* the checksum callbacks dereference sh_struct
+                           unconditionally, so it must not be NULL */
+                        ERR_ABORT (local->sh_struct);
+
+                        /* +1 is for NS */
+                        for (index = 0;
+                             index < (priv->child_count + 1); index++) {
+                                STACK_WIND_COOKIE (frame,
+                                                   unify_sh_checksum_cbk,
+                                                   priv->xl_array[index],
+                                                   priv->xl_array[index],
+                                                   priv->xl_array[index]->fops->checksum,
+                                                   &local->loc1,
+                                                   0);
+                        }
+
+                        /* Self-heal in foreground, hence no need
+                           to UNWIND here */
+                        return 0;
+                }
+
+                /* Self Heal done in background */
+                bg_frame = copy_frame (frame);
+                INIT_LOCAL (bg_frame, bg_local);
+                loc_copy (&bg_local->loc1, &local->loc1);
+                bg_local->op_ret = 0;
+                bg_local->failed = 0;
+                bg_local->call_count = priv->child_count + 1;
+                bg_local->sh_struct =
+                        calloc (1, sizeof (struct unify_self_heal_struct));
+                /* same reason as in the foreground branch */
+                ERR_ABORT (bg_local->sh_struct);
+
+                /* +1 is for NS */
+                for (index = 0; index < (priv->child_count + 1); index++) {
+                        STACK_WIND_COOKIE (bg_frame,
+                                           unify_bgsh_checksum_cbk,
+                                           priv->xl_array[index],
+                                           priv->xl_array[index],
+                                           priv->xl_array[index]->fops->checksum,
+                                           &bg_local->loc1,
+                                           0);
+                }
+        }
+
+        /* generation number matches, self heal already done or
+         * self heal done in background: just do STACK_UNWIND
+         */
+        /* loc1 is wiped below, so keep the inode pointer for the unwind */
+        tmp_inode = local->loc1.inode;
+        tmp_dict = local->dict;
+
+        unify_local_wipe (local);
+
+        /* This is lookup_cbk ()'s UNWIND. */
+        STACK_UNWIND (frame,
+                      local->op_ret,
+                      local->op_errno,
+                      tmp_inode,
+                      &local->stbuf,
+                      local->dict);
+
+        /* drop the dict reference through the saved pointer, not through
+           local */
+        if (tmp_dict)
+                dict_unref (tmp_dict);
+
+        return 0;
+}
+
diff --git a/xlators/cluster/unify/src/unify.c b/xlators/cluster/unify/src/unify.c
new file mode 100644
index 000000000..e2a5e14b1
--- /dev/null
+++ b/xlators/cluster/unify/src/unify.c
@@ -0,0 +1,4451 @@
+/*
+ Copyright (c) 2006, 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * xlators/cluster/unify:
+ *     - This xlator is one of the main translators in GlusterFS; it
+ *       does the actual clustering work of the file system. Note that
+ *       unify assumes a file exists on only one of the child nodes,
+ *       while directories are present on all the nodes.
+ *
+ * NOTE:
+ *   Unify now supports a global namespace, which is used to keep a
+ *   global view of the fs's namespace tree. The stat for directories is
+ *   taken only from the namespace, whereas for files only 'st_ino' is
+ *   taken from the namespace node and the rest of the stat info is taken
+ *   from the actual storage node. The namespace node also helps to keep
+ *   inode numbers of files consistent across glusterfs (re-)mounts.
+ */
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "unify.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include <signal.h>
+#include <libgen.h>
+#include "compat-errno.h"
+#include "compat.h"
+
+/* Unwind with EINVAL and return 0 from the enclosing fop when _loc or its
+ * inode is missing. The expansion refers to a variable named 'frame', which
+ * the enclosing function must provide. The argument is parenthesized so that
+ * any pointer expression can be passed safely. */
+#define UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR(_loc) do { \
+        if (!((_loc) && (_loc)->inode)) { \
+                STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL); \
+                return 0; \
+        } \
+} while(0)
+
+
+/* Unwind with EBADFD and return 0 from the enclosing fop unless _fd is
+ * non-NULL and fd_ctx_get() on it returns 0. The expansion refers to
+ * variables named 'frame' and 'this', which the enclosing function must
+ * provide. The argument is parenthesized so that any pointer expression can
+ * be passed safely. */
+#define UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(_fd) do { \
+        if (!((_fd) && !fd_ctx_get ((_fd), this, NULL))) { \
+                STACK_UNWIND (frame, -1, EBADFD, NULL, NULL); \
+                return 0; \
+        } \
+} while(0)
+
+/* Unwind with EBADFD and return 0 from the enclosing fop when _fd is NULL.
+ * The expansion refers to a variable named 'frame', which the enclosing
+ * function must provide. The argument is parenthesized so that any pointer
+ * expression can be passed safely. */
+#define UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(_fd) do { \
+        if (!(_fd)) { \
+                STACK_UNWIND (frame, -1, EBADFD, NULL, NULL); \
+                return 0; \
+        } \
+} while(0)
+
+/**
+ * unify_local_wipe - free all the extra allocation of local->* here.
+ *
+ * Frees local->name when it is set and calls loc_wipe() on both loc_t
+ * members (loc1 and loc2). The unify_local_t itself is not freed here, so
+ * callers can still read its remaining fields afterwards.
+ *
+ * @local: per-call state whose name/loc1/loc2 members are released.
+ */
+static void
+unify_local_wipe (unify_local_t *local)
+{
+ /* Free the strdup'd variables in the local structure */
+ if (local->name) {
+ FREE (local->name);
+ }
+ loc_wipe (&local->loc1);
+ loc_wipe (&local->loc2);
+}
+
+
+
+/*
+ * unify_normalize_stats - re-express the block counters of @buf in the
+ * given unit sizes.
+ *
+ * @buf: statvfs record, converted in place.
+ * @bsize: block size to convert to; when it differs from buf->f_bsize,
+ *         f_bfree and f_bavail are scaled by (old f_bsize / bsize).
+ * @frsize: fragment size to convert to; when it differs from buf->f_frsize,
+ *          f_blocks is scaled by (old f_frsize / frsize).
+ *
+ * Scaled values are truncated when converted back to fsblkcnt_t.
+ */
+void
+unify_normalize_stats (struct statvfs *buf,
+                       unsigned long bsize,
+                       unsigned long frsize)
+{
+        if (buf->f_bsize != bsize) {
+                /* ratio must be taken before f_bsize is overwritten */
+                double bscale = ((double) buf->f_bsize) / bsize;
+
+                buf->f_bfree  = (fsblkcnt_t) (bscale * buf->f_bfree);
+                buf->f_bavail = (fsblkcnt_t) (bscale * buf->f_bavail);
+                buf->f_bsize  = bsize;
+        }
+
+        if (buf->f_frsize != frsize) {
+                /* ratio must be taken before f_frsize is overwritten */
+                double frscale = ((double) buf->f_frsize) / frsize;
+
+                buf->f_blocks = (fsblkcnt_t) (frscale * buf->f_blocks);
+                buf->f_frsize = frsize;
+        }
+}
+
+
+/*
+ * unify_loc_subvol - pick the subvolume that serves @loc.
+ *
+ * Returns NS (this) for directories and for inodes that carry no node list
+ * in their ctx. For other inodes, returns the first xl_array entry named in
+ * the (-1 terminated) ctx list that is not the namespace; if the list names
+ * nothing but the namespace, NS (this) is returned.
+ *
+ * @loc: location to resolve; loc->inode is dereferenced unconditionally.
+ * @this: pointer to unify xlator.
+ */
+xlator_t *
+unify_loc_subvol (loc_t *loc, xlator_t *this)
+{
+        unify_private_t *priv = this->private;
+        int16_t         *nodes = NULL;
+        uint64_t         ctx_val = 0;
+        long             pos = 0;
+
+        if (S_ISDIR (loc->inode->st_mode))
+                return NS (this);
+
+        /* on failure ctx_val stays 0 and is treated as "no list" below */
+        inode_ctx_get (loc->inode, this, &ctx_val);
+        nodes = (int16_t *)(long)ctx_val;
+        if (nodes == NULL)
+                return NS (this);
+
+        for (pos = 0; nodes[pos] != -1; pos++) {
+                xlator_t *candidate = priv->xl_array[nodes[pos]];
+
+                if (candidate != NS (this))
+                        return candidate;
+        }
+
+        return NS (this);
+}
+
+
+
+/**
+ * unify_statfs_cbk - accumulate the statfs() replies of the children.
+ *
+ * Every successful reply is added into local->statvfs_buf; when an earlier
+ * reply is already stored there, both records are first converted to the
+ * larger of the two block/fragment sizes. Failed replies only record their
+ * errno. The frame is unwound by the reply that brings local->call_count
+ * down to zero.
+ *
+ * @cookie: frame of the child the reply came from (used for its name).
+ * @stbuf: statvfs record of the reply; may be rescaled in place.
+ */
+int32_t
+unify_statfs_cbk (call_frame_t *frame,
+                  void *cookie,
+                  xlator_t *this,
+                  int32_t op_ret,
+                  int32_t op_errno,
+                  struct statvfs *stbuf)
+{
+        unify_local_t  *local = (unify_local_t *)frame->local;
+        call_frame_t   *prev_frame = cookie;
+        struct statvfs *total = &local->statvfs_buf;
+        unsigned long   bsize = 0;
+        unsigned long   frsize = 0;
+        int32_t         remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret < 0) {
+                        /* fop on storage node has failed due to some error */
+                        if (op_errno != ENOTCONN) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "child(%s): %s",
+                                        prev_frame->this->name,
+                                        strerror (op_errno));
+                        }
+                        local->op_errno = op_errno;
+                } else {
+                        if (total->f_bsize == 0) {
+                                /* nothing accumulated yet: adopt the unit
+                                   sizes of this reply */
+                                total->f_bsize = stbuf->f_bsize;
+                                total->f_frsize = stbuf->f_frsize;
+                        } else {
+                                /* bring both records to common units
+                                   before adding them up */
+                                bsize = max (total->f_bsize,
+                                             stbuf->f_bsize);
+                                frsize = max (total->f_frsize,
+                                              stbuf->f_frsize);
+                                unify_normalize_stats(total, bsize, frsize);
+                                unify_normalize_stats(stbuf, bsize, frsize);
+                        }
+
+                        total->f_blocks += stbuf->f_blocks;
+                        total->f_bfree += stbuf->f_bfree;
+                        total->f_bavail += stbuf->f_bavail;
+                        total->f_files += stbuf->f_files;
+                        total->f_ffree += stbuf->f_ffree;
+                        total->f_favail += stbuf->f_favail;
+                        /* these three are taken from the latest reply */
+                        total->f_fsid = stbuf->f_fsid;
+                        total->f_flag = stbuf->f_flag;
+                        total->f_namemax = stbuf->f_namemax;
+                        local->op_ret = op_ret;
+                }
+                remaining = --local->call_count;
+        }
+        UNLOCK (&frame->lock);
+
+        if (remaining)
+                return 0;
+
+        STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                      &local->statvfs_buf);
+
+        return 0;
+}
+
+/**
+ * unify_statfs - send statfs() to every entry of this->children; the
+ * replies are merged by unify_statfs_cbk().
+ *
+ * The expected number of replies is taken from priv->child_count.
+ */
+int32_t
+unify_statfs (call_frame_t *frame,
+              xlator_t *this,
+              loc_t *loc)
+{
+        unify_local_t   *local = NULL;
+        unify_private_t *priv = this->private;
+        xlator_list_t   *child = NULL;
+
+        INIT_LOCAL (frame, local);
+        local->call_count = priv->child_count;
+
+        for (child = this->children; child; child = child->next) {
+                STACK_WIND (frame,
+                            unify_statfs_cbk,
+                            child->xlator,
+                            child->xlator->fops->statfs,
+                            loc);
+        }
+
+        return 0;
+}
+
+/**
+ * unify_buf_cbk - common reply handler for fops that return a struct stat.
+ *
+ * From the namespace reply the inode number is always taken, and the whole
+ * stat as well when the entry is a directory or when no stat has been
+ * stored yet. From a storage-node reply the stat is taken when the entry is
+ * not a directory. The frame is unwound by the reply that brings
+ * local->call_count down to zero; the call fails if no inode number was
+ * recorded.
+ *
+ * @cookie: frame of the child the reply came from.
+ * @buf: stat of the reply; read only when op_ret >= 0.
+ */
+int32_t
+unify_buf_cbk (call_frame_t *frame,
+               void *cookie,
+               xlator_t *this,
+               int32_t op_ret,
+               int32_t op_errno,
+               struct stat *buf)
+{
+        unify_local_t   *local = frame->local;
+        unify_private_t *priv = this->private;
+        call_frame_t    *prev_frame = cookie;
+        int32_t          remaining = 0;
+        int              from_ns = 0;
+
+        LOCK (&frame->lock);
+        {
+                remaining = --local->call_count;
+
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "%s(): child(%s): path(%s): %s",
+                                gf_fop_list[frame->root->op],
+                                prev_frame->this->name,
+                                (local->loc1.path)?local->loc1.path:"",
+                                strerror (op_errno));
+
+                        local->op_errno = op_errno;
+                        /* in optimist mode a missing entry on one node
+                           does not fail the call */
+                        if (priv->optimist && (op_errno == ENOENT))
+                                local->op_ret = 0;
+                } else if (op_ret >= 0) {
+                        local->op_ret = 0;
+                        from_ns = (NS (this) == prev_frame->this);
+
+                        if (from_ns) {
+                                local->st_ino = buf->st_ino;
+                                /* If the entry is directory, get the stat
+                                   from NS node */
+                                if (S_ISDIR (buf->st_mode) ||
+                                    !local->stbuf.st_blksize) {
+                                        local->stbuf = *buf;
+                                }
+                        } else if (!S_ISDIR (buf->st_mode)) {
+                                /* If file, take the stat info from Storage
+                                   node. */
+                                local->stbuf = *buf;
+                        }
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (remaining)
+                return 0;
+
+        /* If the inode number is not filled, operation should
+           fail */
+        if (!local->st_ino)
+                local->op_ret = -1;
+
+        local->stbuf.st_ino = local->st_ino;
+        unify_local_wipe (local);
+        STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                      &local->stbuf);
+
+        return 0;
+}
+
+/* True when the permission bits of the struct stat pointed to by 's' are
+ * exactly S_ISVTX (sticky bit only). The argument is parenthesized so that
+ * pointer expressions such as 'p + 1' or '&st' expand correctly. */
+#define check_if_dht_linkfile(s) ((((s)->st_mode) & ~S_IFMT) == S_ISVTX)
+
+/**
+ * unify_lookup_cbk - collect the lookup() replies of the namespace and the
+ * storage nodes; the reply that brings local->call_count to zero finishes
+ * the lookup.
+ *
+ * Per reply (under the frame lock):
+ *  - failures are logged and recorded in local->op_errno / local->failed
+ *    (ENOTCONN and ENOENT only count during revalidate, and ENOENT not at
+ *    all in optimist mode during revalidate);
+ *  - on success the directory/non-directory type is cross-checked against
+ *    the stat already stored, the replying node is appended to local->list
+ *    (fresh lookup of a non-directory only), and stat / st_ino / st_nlink /
+ *    dict are merged into local.
+ *
+ * On the last reply the node list is stored in the inode ctx for files, and
+ * the call is either handed to zr_unify_self_heal() (successful fresh lookup
+ * of a directory, with self-heal enabled and optimist off) or unwound
+ * directly.
+ *
+ * @cookie: index into priv->xl_array of the replying node; the namespace
+ *          is at index priv->child_count.
+ * @inode: inode of the reply; passed back only on the EIO unwind.
+ * @buf: stat of the reply; read only when op_ret == 0.
+ * @dict: xattr dict of the reply; the first one from a non-namespace node
+ *        is ref'd and kept.
+ *
+ * Always returns 0. Memory exhaustion aborts through ERR_ABORT.
+ */
+int32_t
+unify_lookup_cbk (call_frame_t *frame,
+                  void *cookie,
+                  xlator_t *this,
+                  int32_t op_ret,
+                  int32_t op_errno,
+                  inode_t *inode,
+                  struct stat *buf,
+                  dict_t *dict)
+{
+        int32_t          callcnt = 0;
+        unify_private_t *priv = this->private;
+        unify_local_t   *local = frame->local;
+        inode_t         *tmp_inode = NULL;
+        dict_t          *local_dict = NULL;
+
+        LOCK (&frame->lock);
+        {
+                callcnt = --local->call_count;
+
+                if (op_ret == -1) {
+                        if ((op_errno != ENOTCONN) && (op_errno != ENOENT)) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "child(%s): path(%s): %s",
+                                        priv->xl_array[(long)cookie]->name,
+                                        local->loc1.path, strerror (op_errno));
+                                local->op_errno = op_errno;
+                                local->failed = 1;
+
+                        } else if (local->revalidate &&
+                                   !(priv->optimist && (op_errno == ENOENT))) {
+
+                                gf_log (this->name,
+                                        (op_errno == ENOTCONN) ?
+                                        GF_LOG_DEBUG:GF_LOG_ERROR,
+                                        "child(%s): path(%s): %s",
+                                        priv->xl_array[(long)cookie]->name,
+                                        local->loc1.path, strerror (op_errno));
+                                local->op_errno = op_errno;
+                                local->failed = 1;
+                        }
+                }
+
+                if (op_ret == 0) {
+                        local->op_ret = 0;
+
+                        if (check_if_dht_linkfile(buf)) {
+                                gf_log (this->name, GF_LOG_CRITICAL,
+                                        "file %s may be DHT link file on %s, "
+                                        "make sure the backend is not shared "
+                                        "between unify and DHT",
+                                        local->loc1.path,
+                                        priv->xl_array[(long)cookie]->name);
+                        }
+
+                        if (local->stbuf.st_mode && local->stbuf.st_blksize) {
+                                /* make sure we already have a stbuf
+                                   stored in local->stbuf */
+                                if (S_ISDIR (local->stbuf.st_mode) &&
+                                    !S_ISDIR (buf->st_mode)) {
+                                        gf_log (this->name, GF_LOG_CRITICAL,
+                                                "[CRITICAL] '%s' is directory "
+                                                "on namespace, non-directory "
+                                                "on node '%s', returning EIO",
+                                                local->loc1.path,
+                                                priv->xl_array[(long)cookie]->name);
+                                        local->return_eio = 1;
+                                }
+                                if (!S_ISDIR (local->stbuf.st_mode) &&
+                                    S_ISDIR (buf->st_mode)) {
+                                        gf_log (this->name, GF_LOG_CRITICAL,
+                                                "[CRITICAL] '%s' is directory "
+                                                "on node '%s', non-directory "
+                                                "on namespace, returning EIO",
+                                                local->loc1.path,
+                                                priv->xl_array[(long)cookie]->name);
+                                        local->return_eio = 1;
+                                }
+                        }
+
+                        if (!local->revalidate && !S_ISDIR (buf->st_mode)) {
+                                /* This is the first time lookup on file*/
+                                if (!local->list) {
+                                        /* list is not allocated, allocate
+                                           the max possible range */
+                                        local->list = CALLOC (1, sizeof (int16_t) *
+                                                              (priv->child_count + 2));
+                                        /* Unwinding from here would leave
+                                           frame->lock held while other
+                                           replies are still outstanding, so
+                                           treat exhaustion like the other
+                                           allocation in this function. */
+                                        if (!local->list) {
+                                                gf_log (this->name,
+                                                        GF_LOG_CRITICAL,
+                                                        "Not enough memory");
+                                        }
+                                        ERR_ABORT (local->list);
+                                }
+                                /* update the index of the list */
+                                local->list [local->index++] =
+                                        (int16_t)(long)cookie;
+                        }
+
+                        if ((!local->dict) && dict &&
+                            (priv->xl_array[(long)cookie] != NS(this))) {
+                                local->dict = dict_ref (dict);
+                        }
+
+                        /* index of NS node is == total child count */
+                        if (priv->child_count == (int16_t)(long)cookie) {
+                                /* Take the inode number from namespace */
+                                local->st_ino = buf->st_ino;
+                                if (S_ISDIR (buf->st_mode) ||
+                                    !(local->stbuf.st_blksize)) {
+                                        local->stbuf = *buf;
+                                }
+                        } else if (!S_ISDIR (buf->st_mode)) {
+                                /* If file, then get the stat from
+                                   storage node */
+                                local->stbuf = *buf;
+                        }
+
+                        /* report the largest link count seen */
+                        if (local->st_nlink < buf->st_nlink) {
+                                local->st_nlink = buf->st_nlink;
+                        }
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (!callcnt) {
+                /* keep our own pointer: the dict reference is dropped after
+                   the unwind, when local must no longer be trusted */
+                local_dict = local->dict;
+                if (local->return_eio) {
+                        gf_log (this->name, GF_LOG_CRITICAL,
+                                "[CRITICAL] Unable to fix the path (%s) with "
+                                "self-heal, try manual verification. "
+                                "returning EIO.", local->loc1.path);
+                        unify_local_wipe (local);
+                        STACK_UNWIND (frame, -1, EIO, inode, NULL, NULL);
+                        if (local_dict) {
+                                dict_unref (local_dict);
+                        }
+                        return 0;
+                }
+
+                if (!local->stbuf.st_blksize) {
+                        /* Inode not present */
+                        local->op_ret = -1;
+                } else {
+                        if (!local->revalidate &&
+                            !S_ISDIR (local->stbuf.st_mode)) {
+                                /* If its a file, big array is useless,
+                                   allocate the smaller one */
+                                int16_t *list = NULL;
+                                list = CALLOC (1, sizeof (int16_t) *
+                                               (local->index + 1));
+                                ERR_ABORT (list);
+                                memcpy (list, local->list,
+                                        sizeof (int16_t) * local->index);
+                                /* Make the end of the list as -1 */
+                                FREE (local->list);
+                                local->list = list;
+                                local->list [local->index] = -1;
+                                /* Update the inode's ctx with proper array */
+                                /* TODO: log on failure */
+                                inode_ctx_put (local->loc1.inode, this,
+                                               (uint64_t)(long)local->list);
+                        }
+
+                        if (S_ISDIR(local->loc1.inode->st_mode)) {
+                                /* lookup is done for directory */
+                                if (local->failed && priv->self_heal) {
+                                        /* Triggering self-heal */
+                                        /* means, self-heal required for this
+                                           inode */
+                                        local->inode_generation = 0;
+                                        priv->inode_generation++;
+                                }
+                        } else {
+                                local->stbuf.st_ino = local->st_ino;
+                        }
+
+                        local->stbuf.st_nlink = local->st_nlink;
+                }
+                if (local->op_ret == -1) {
+                        if (!local->revalidate && local->list)
+                                FREE (local->list);
+                }
+
+                if ((local->op_ret >= 0) && local->failed &&
+                    local->revalidate) {
+                        /* Done revalidate, but it failed */
+                        /* report the errno recorded with the failure; the
+                           op_errno argument belongs to whichever reply
+                           happened to arrive last */
+                        if (local->op_errno != ENOTCONN) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Revalidate failed for path(%s): %s",
+                                        local->loc1.path,
+                                        strerror (local->op_errno));
+                        }
+                        local->op_ret = -1;
+                }
+
+                if ((priv->self_heal && !priv->optimist) &&
+                    (!local->revalidate && (local->op_ret == 0) &&
+                     S_ISDIR(local->stbuf.st_mode))) {
+                        /* Let the self heal be done here */
+                        zr_unify_self_heal (frame, this, local);
+                        /* zr_unify_self_heal() has taken care of the dict */
+                        local_dict = NULL;
+                } else {
+                        /* either no self heal, or op_ret == -1 (failure) */
+                        tmp_inode = local->loc1.inode;
+                        unify_local_wipe (local);
+                        STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                                      tmp_inode, &local->stbuf, local->dict);
+                }
+                if (local_dict) {
+                        dict_unref (local_dict);
+                }
+        }
+
+        return 0;
+}
+
+/**
+ * unify_lookup - look an entry up on the namespace and the storage nodes.
+ *
+ * When the inode of a non-directory already carries a node list in its ctx,
+ * the call is a revalidate and is sent only to the nodes named in that
+ * list. Otherwise it is a fresh lookup and is sent to
+ * xl_array[0 .. child_count]. All replies are handled by
+ * unify_lookup_cbk(), with the xl_array index of the node as cookie.
+ *
+ * @frame: call frame.
+ * @this: pointer to unify xlator.
+ * @loc: location to look up; loc and loc->inode must be non-NULL,
+ * otherwise the call is unwound with EINVAL.
+ * @xattr_req: passed through unchanged to every child.
+ *
+ * Always returns 0; results are delivered through STACK_UNWIND.
+ */
+int32_t
+unify_lookup (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ dict_t *xattr_req)
+{
+ unify_local_t *local = NULL;
+ unify_private_t *priv = this->private;
+ int16_t *list = NULL;
+ long index = 0;
+
+ if (!(loc && loc->inode)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: Argument not right", loc?loc->path:"(null)");
+ STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL);
+ return 0;
+ }
+
+ /* Initialization */
+ INIT_LOCAL (frame, local);
+ loc_copy (&local->loc1, loc);
+ if (local->loc1.path == NULL) {
+ gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
+ STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL, NULL);
+ return 0;
+ }
+
+ /* a ctx value on an already-typed, non-directory inode is taken
+ to be the node list */
+ if (!inode_ctx_get (loc->inode, this, NULL) &&
+ loc->inode->st_mode &&
+ !S_ISDIR (loc->inode->st_mode)) {
+ uint64_t tmp_list = 0;
+ /* check if revalidate or fresh lookup */
+ inode_ctx_get (loc->inode, this, &tmp_list);
+ local->list = (int16_t *)(long)tmp_list;
+ }
+
+ if (local->list) {
+ list = local->list;
+ /* count the entries of the -1 terminated list */
+ for (index = 0; list[index] != -1; index++);
+ /* presumably the expected two entries are the namespace and one
+ storage node - TODO confirm */
+ if (index != 2) {
+ if (index < 2) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "returning ESTALE for %s: file "
+ "count is %ld", loc->path, index);
+ /* Print where all the file is present */
+ for (index = 0;
+ local->list[index] != -1; index++) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: found on %s", loc->path,
+ priv->xl_array[list[index]]->name);
+ }
+ unify_local_wipe (local);
+ STACK_UNWIND (frame, -1, ESTALE,
+ NULL, NULL, NULL);
+ return 0;
+ } else {
+ /* There are more than 2 presences */
+ /* Just log and continue */
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: file count is %ld",
+ loc->path, index);
+ /* Print where all the file is present */
+ for (index = 0;
+ local->list[index] != -1; index++) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: found on %s", loc->path,
+ priv->xl_array[list[index]]->name);
+ }
+ }
+ }
+
+ /* is revalidate */
+ local->revalidate = 1;
+
+ /* the number of expected replies must be known before the
+ first request is sent */
+ for (index = 0; list[index] != -1; index++)
+ local->call_count++;
+
+ for (index = 0; list[index] != -1; index++) {
+ /* the end-of-list test is done before the wind so that
+ the list is not read again after the last request;
+ presumably because it may no longer be valid by then
+ - TODO confirm */
+ char need_break = (list[index+1] == -1);
+ STACK_WIND_COOKIE (frame,
+ unify_lookup_cbk,
+ (void *)(long)list[index], //cookie
+ priv->xl_array [list[index]],
+ priv->xl_array [list[index]]->fops->lookup,
+ loc,
+ xattr_req);
+ if (need_break)
+ break;
+ }
+ } else {
+ if (loc->inode->st_mode) {
+ /* NOTE(review): the value is fetched only when the
+ NULL-probe returns non-zero, whereas the test further
+ up treats a zero return as "ctx present" - confirm
+ the intended condition */
+ if (inode_ctx_get (loc->inode, this, NULL)) {
+ inode_ctx_get (loc->inode, this,
+ &local->inode_generation);
+ }
+ }
+ /* This is first call, there is no list */
+ /* call count should be all child + 1 namespace */
+ local->call_count = priv->child_count + 1;
+
+ for (index = 0; index <= priv->child_count; index++) {
+ STACK_WIND_COOKIE (frame,
+ unify_lookup_cbk,
+ (void *)index, //cookie
+ priv->xl_array[index],
+ priv->xl_array[index]->fops->lookup,
+ loc,
+ xattr_req);
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * unify_stat - if directory, get the stat directly from NameSpace child.
+ *              if file, send the request to every node named in the list
+ *              stored in the inode ctx.
+ *
+ * Replies are merged by unify_buf_cbk().
+ *
+ * @frame: call frame.
+ * @this: pointer to unify xlator.
+ * @loc: location to stat; loc and loc->inode must be non-NULL.
+ *
+ * Unwinds with EINVAL when @loc is unusable or when a non-directory inode
+ * carries no (or an empty) node list, and with ENOMEM when @loc cannot be
+ * copied. Always returns 0.
+ */
+int32_t
+unify_stat (call_frame_t *frame,
+            xlator_t *this,
+            loc_t *loc)
+{
+        unify_local_t   *local = NULL;
+        unify_private_t *priv = this->private;
+        int16_t          index = 0;
+        int16_t         *list = NULL;
+        uint64_t         tmp_list = 0;
+
+        UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        /* Initialization */
+        INIT_LOCAL (frame, local);
+        loc_copy (&local->loc1, loc);
+        if (local->loc1.path == NULL) {
+                gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
+                STACK_UNWIND (frame, -1, ENOMEM, NULL);
+                return 0;
+        }
+        local->st_ino = loc->inode->ino;
+
+        if (S_ISDIR (loc->inode->st_mode)) {
+                /* Directory */
+                local->call_count = 1;
+                STACK_WIND (frame, unify_buf_cbk, NS(this),
+                            NS(this)->fops->stat, loc);
+                return 0;
+        }
+
+        /* File */
+        inode_ctx_get (loc->inode, this, &tmp_list);
+        list = (int16_t *)(long)tmp_list;
+
+        /* Without a list there is nothing to dereference, and with an
+           empty one no request would be sent and the frame would never be
+           unwound. */
+        if (!list || (list[0] == -1)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "%s: no node list found in inode ctx", loc->path);
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, -1, EINVAL, NULL);
+                return 0;
+        }
+
+        /* the number of expected replies must be known before the first
+           request is sent */
+        for (index = 0; list[index] != -1; index++)
+                local->call_count++;
+
+        for (index = 0; list[index] != -1; index++) {
+                /* decide about the end of the list before winding, so the
+                   list is not read again after the last request */
+                char need_break = (list[index+1] == -1);
+                STACK_WIND (frame,
+                            unify_buf_cbk,
+                            priv->xl_array[list[index]],
+                            priv->xl_array[list[index]]->fops->stat,
+                            loc);
+                if (need_break)
+                        break;
+        }
+
+        return 0;
+}
+
+/**
+ * unify_access_cbk - reply handler of unify_access().
+ *
+ * Passes op_ret and op_errno of the reply on unchanged; @cookie and @this
+ * are not used.
+ *
+ * Always returns 0.
+ */
+int32_t
+unify_access_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno)
+{
+ STACK_UNWIND (frame, op_ret, op_errno);
+ return 0;
+}
+
+
+/**
+ * unify_access - Send request to only namespace, which has all the
+ * attributes set for the file.
+ *
+ * @frame: call frame.
+ * @this: pointer to unify xlator.
+ * @loc: location to check; when it or its inode is NULL the call is
+ * unwound with EINVAL by the check macro below.
+ * @mask: access mask, passed through unchanged.
+ *
+ * Always returns 0; the result is delivered by unify_access_cbk().
+ */
+int32_t
+unify_access (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ int32_t mask)
+{
+ UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+ STACK_WIND (frame,
+ unify_access_cbk,
+ NS(this),
+ NS(this)->fops->access,
+ loc,
+ mask);
+
+ return 0;
+}
+
+/**
+ * unify_mkdir_cbk - collect the mkdir() replies of the storage nodes.
+ *
+ * A failed reply is logged and its errno recorded, unless optimist mode is
+ * on and the errno is ENOENT or EEXIST; any recorded errno other than
+ * EEXIST also marks the call as failed. The reply that brings
+ * local->call_count to zero stores the current generation in the inode ctx
+ * (only when nothing was marked failed) and unwinds with the stat kept in
+ * local->stbuf.
+ *
+ * @cookie: index into priv->xl_array of the replying node.
+ * @inode, @buf: not read here.
+ */
+int32_t
+unify_mkdir_cbk (call_frame_t *frame,
+                 void *cookie,
+                 xlator_t *this,
+                 int32_t op_ret,
+                 int32_t op_errno,
+                 inode_t *inode,
+                 struct stat *buf)
+{
+        unify_local_t   *local = frame->local;
+        unify_private_t *priv = this->private;
+        inode_t         *new_inode = NULL;
+        int32_t          remaining = 0;
+        int              tolerated = 0;
+
+        LOCK (&frame->lock);
+        {
+                remaining = --local->call_count;
+
+                if (op_ret >= 0) {
+                        local->op_ret = 0;
+                } else if (op_ret == -1) {
+                        tolerated = (priv->optimist &&
+                                     (op_errno == ENOENT ||
+                                      op_errno == EEXIST));
+                        if (!tolerated) {
+                                /* TODO: Decrement the inode_generation of
+                                 * this->inode's parent inode, hence the
+                                 * missing directory is created properly by
+                                 * self-heal. Currently, there is no way to
+                                 * get the parent inode directly.
+                                 */
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "child(%s): path(%s): %s",
+                                        priv->xl_array[(long)cookie]->name,
+                                        local->loc1.path, strerror (op_errno));
+                                if (op_errno != EEXIST)
+                                        local->failed = 1;
+                                local->op_errno = op_errno;
+                        }
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (remaining)
+                return 0;
+
+        if (!local->failed) {
+                inode_ctx_put (local->loc1.inode, this,
+                               priv->inode_generation);
+        }
+
+        /* loc1 is wiped next, so keep the inode pointer for the unwind */
+        new_inode = local->loc1.inode;
+        unify_local_wipe (local);
+
+        STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                      new_inode, &local->stbuf);
+
+        return 0;
+}
+
+/**
+ * unify_ns_mkdir_cbk - reply handler for the namespace mkdir().
+ *
+ * If the namespace refused the mkdir, the error is logged and unwound as
+ * is. Otherwise the namespace stat is kept in local->stbuf and mkdir() is
+ * sent to xl_array[0 .. child_count-1]; those replies are collected by
+ * unify_mkdir_cbk().
+ *
+ * @inode: inode of the reply; passed back on the error unwind.
+ * @buf: stat of the new directory as seen by the namespace.
+ *
+ * Always returns 0.
+ */
+int32_t
+unify_ns_mkdir_cbk (call_frame_t *frame,
+                    void *cookie,
+                    xlator_t *this,
+                    int32_t op_ret,
+                    int32_t op_errno,
+                    inode_t *inode,
+                    struct stat *buf)
+{
+        unify_private_t *priv = this->private;
+        unify_local_t   *local = frame->local;
+        long             index = 0;
+
+        if (op_ret == -1) {
+                /* No need to send mkdir request to other servers,
+                 * as namespace action failed
+                 */
+                /* unify_mkdir() stores the path in loc1 and never sets
+                   local->name, so log the path from loc1 */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "namespace: path(%s): %s",
+                        local->loc1.path, strerror (op_errno));
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, op_ret, op_errno, inode, NULL);
+                return 0;
+        }
+
+        /* Create one inode for this entry */
+        local->op_ret = 0;
+        local->stbuf = *buf;
+
+        local->call_count = priv->child_count;
+
+        /* Send mkdir request to all the nodes now */
+        for (index = 0; index < priv->child_count; index++) {
+                STACK_WIND_COOKIE (frame,
+                                   unify_mkdir_cbk,
+                                   (void *)index, //cookie
+                                   priv->xl_array[index],
+                                   priv->xl_array[index]->fops->mkdir,
+                                   &local->loc1,
+                                   local->mode);
+        }
+
+        return 0;
+}
+
+
+/**
+ * unify_mkdir - create a directory, namespace first.
+ *
+ * Keeps a copy of @loc and @mode in local and sends mkdir() to the
+ * namespace only; unify_ns_mkdir_cbk() continues with the storage nodes.
+ *
+ * @frame: call frame.
+ * @this: pointer to unify xlator.
+ * @loc: location of the directory to create.
+ * @mode: mode of the new directory.
+ *
+ * Unwinds with ENOMEM when @loc cannot be copied. Always returns 0.
+ */
+int32_t
+unify_mkdir (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ mode_t mode)
+{
+ unify_local_t *local = NULL;
+
+ /* NOTE(review): unlike unify_rmdir(), loc and loc->inode are not
+ validated before use here */
+ /* Initialization */
+ INIT_LOCAL (frame, local);
+ local->mode = mode;
+
+ loc_copy (&local->loc1, loc);
+
+ if (local->loc1.path == NULL) {
+ gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
+ STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL);
+ return 0;
+ }
+
+ STACK_WIND (frame,
+ unify_ns_mkdir_cbk,
+ NS(this),
+ NS(this)->fops->mkdir,
+ loc,
+ mode);
+ return 0;
+}
+
+/**
+ * unify_rmdir_cbk - collect the rmdir() replies of the storage nodes.
+ *
+ * The call counts as successful as soon as one reply succeeded, or - in
+ * optimist mode - reported ENOENT. The errno of a failed reply is recorded
+ * in any case. The reply that brings local->call_count to zero unwinds.
+ */
+int32_t
+unify_rmdir_cbk (call_frame_t *frame,
+                 void *cookie,
+                 xlator_t *this,
+                 int32_t op_ret,
+                 int32_t op_errno)
+{
+        unify_local_t   *local = frame->local;
+        unify_private_t *priv = this->private;
+        int32_t          remaining = 0;
+        int              removed = 0;
+        int              already_gone = 0;
+
+        LOCK (&frame->lock);
+        {
+                remaining = --local->call_count;
+
+                removed = (op_ret == 0);
+                already_gone = (priv->optimist && (op_errno == ENOENT));
+                if (removed || already_gone)
+                        local->op_ret = 0;
+
+                if (op_ret == -1)
+                        local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        if (remaining)
+                return 0;
+
+        unify_local_wipe (local);
+        STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+        return 0;
+}
+
+/**
+ * unify_ns_rmdir_cbk - reply handler for the namespace rmdir().
+ *
+ * A namespace failure is logged (at debug level for ENOTEMPTY, at error
+ * level otherwise) and unwound as is. On success rmdir() is sent to
+ * xl_array[0 .. child_count-1]; those replies are collected by
+ * unify_rmdir_cbk().
+ */
+int32_t
+unify_ns_rmdir_cbk (call_frame_t *frame,
+                    void *cookie,
+                    xlator_t *this,
+                    int32_t op_ret,
+                    int32_t op_errno)
+{
+        unify_local_t   *local = frame->local;
+        unify_private_t *priv = this->private;
+        int16_t          node = 0;
+
+        if (op_ret == -1) {
+                /* No need to send rmdir request to other servers,
+                 * as namespace action failed
+                 */
+                gf_log (this->name,
+                        ((op_errno == ENOTEMPTY) ?
+                         GF_LOG_DEBUG : GF_LOG_ERROR),
+                        "namespace: path(%s): %s",
+                        local->loc1.path, strerror (op_errno));
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, op_ret, op_errno);
+                return 0;
+        }
+
+        /* one reply is expected from every storage node */
+        local->call_count = priv->child_count;
+
+        for (node = 0; node < priv->child_count; node++) {
+                STACK_WIND (frame,
+                            unify_rmdir_cbk,
+                            priv->xl_array[node],
+                            priv->xl_array[node]->fops->rmdir,
+                            &local->loc1);
+        }
+
+        return 0;
+}
+
+/**
+ * unify_rmdir - remove a directory, namespace first.
+ *
+ * Keeps a copy of @loc in local and sends rmdir() to the namespace only;
+ * unify_ns_rmdir_cbk() continues with the storage nodes.
+ *
+ * @frame: call frame.
+ * @this: pointer to unify xlator.
+ * @loc: location of the directory; when it or its inode is NULL the call is
+ * unwound with EINVAL by the check macro below.
+ *
+ * Unwinds with ENOMEM when @loc cannot be copied. Always returns 0.
+ */
+int32_t
+unify_rmdir (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc)
+{
+ unify_local_t *local = NULL;
+
+ UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+ /* Initialization */
+ INIT_LOCAL (frame, local);
+
+ loc_copy (&local->loc1, loc);
+ if (local->loc1.path == NULL) {
+ gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
+ STACK_UNWIND (frame, -1, ENOMEM);
+ return 0;
+ }
+
+ STACK_WIND (frame,
+ unify_ns_rmdir_cbk,
+ NS(this),
+ NS(this)->fops->rmdir,
+ loc);
+
+ return 0;
+}
+
+/**
+ * unify_open_cbk - collect the open() replies of the namespace and the
+ * storage node.
+ *
+ * A successful reply from a node other than the namespace stores that
+ * node's xlator pointer in the fd ctx. The reply that brings
+ * local->call_count to zero unwinds; if by then one open succeeded and
+ * another failed, the mismatch is logged and the call fails with the
+ * recorded errno.
+ *
+ * @cookie: xlator the reply came from.
+ * @fd: fd of the reply; its ctx is written on storage-node success.
+ */
+int32_t
+unify_open_cbk (call_frame_t *frame,
+                void *cookie,
+                xlator_t *this,
+                int32_t op_ret,
+                int32_t op_errno,
+                fd_t *fd)
+{
+        unify_local_t *local = frame->local;
+        int32_t        remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret >= 0) {
+                        local->op_ret = op_ret;
+                        if ((xlator_t *)cookie != NS(this)) {
+                                /* Store child node's ptr, used in
+                                   all the f*** / FileIO calls */
+                                fd_ctx_set (fd, this, (uint64_t)(long)cookie);
+                        }
+                } else if (op_ret == -1) {
+                        local->op_errno = op_errno;
+                        local->failed = 1;
+                }
+                remaining = --local->call_count;
+        }
+        UNLOCK (&frame->lock);
+
+        if (remaining)
+                return 0;
+
+        if ((local->failed == 1) && (local->op_ret >= 0)) {
+                local->call_count = 1;
+                /* return -1 to user; op_errno keeps the value recorded
+                   from the failed reply */
+                local->op_ret = -1;
+
+                /* the fd ctx is only written for the storage node, so its
+                   presence tells which side succeeded */
+                if (fd_ctx_get (local->fd, this, NULL)) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Open success on namespace, "
+                                "failed on child node");
+                } else {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Open success on child node, "
+                                "failed on namespace");
+                }
+        }
+
+        unify_local_wipe (local);
+        STACK_UNWIND (frame, local->op_ret,
+                      local->op_errno, local->fd);
+
+        return 0;
+}
+
+#ifdef GF_DARWIN_HOST_OS
+/**
+ * unify_open_lookup_cbk - collect the lookup() replies sent by
+ * unify_open_readlink_cbk() and, once all have arrived, open the file.
+ *
+ * Every successful reply is counted in local->index and the replying node
+ * is remembered in local->list[0] (namespace) or local->list[1] (any other
+ * node). When the last reply has arrived, open() is sent to both remembered
+ * nodes unless the entry was found on fewer than two nodes or turned out to
+ * be a directory.
+ *
+ * @cookie: index into priv->xl_array of the replying node.
+ * @inode, @dict: not read here.
+ * @buf: stat of the reply; read only when op_ret >= 0.
+ *
+ * Always returns 0.
+ */
+int32_t
+unify_open_lookup_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ inode_t *inode,
+ struct stat *buf,
+ dict_t *dict)
+{
+ int32_t callcnt = 0;
+ int16_t index = 0;
+ unify_private_t *priv = this->private;
+ unify_local_t *local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+ if ((op_ret == -1) && (op_errno != ENOENT)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "child(%s): path(%s): %s",
+ priv->xl_array[(long)cookie]->name,
+ local->loc1.path, strerror (op_errno));
+ local->op_errno = op_errno;
+ }
+
+ if (op_ret >= 0) {
+ local->op_ret = op_ret;
+ local->index++;
+ /* slot 0 is the namespace, slot 1 the most recent other
+ node that answered */
+ if (NS(this) == priv->xl_array[(long)cookie]) {
+ local->list[0] = (int16_t)(long)cookie;
+ } else {
+ local->list[1] = (int16_t)(long)cookie;
+ }
+ /* 'failed' is used here as "target is a directory" */
+ if (S_ISDIR (buf->st_mode))
+ local->failed = 1;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ int16_t file_list[3] = {0,};
+ local->op_ret = -1;
+
+ /* private, -1 terminated copy of the two nodes to open on */
+ file_list[0] = local->list[0];
+ file_list[1] = local->list[1];
+ file_list[2] = -1;
+
+ if (local->index != 2) {
+ /* Lookup failed, can't do open */
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: present on %d nodes",
+ local->name, local->index);
+
+ if (local->index < 2) {
+ unify_local_wipe (local);
+ gf_log (this->name, GF_LOG_ERROR,
+ "returning as file found on less "
+ "than 2 nodes");
+ STACK_UNWIND (frame, local->op_ret,
+ local->op_errno, local->fd);
+ return 0;
+ }
+ }
+
+ if (local->failed) {
+ /* Open on directory, return EISDIR */
+ unify_local_wipe (local);
+ STACK_UNWIND (frame, -1, EISDIR, local->fd);
+ return 0;
+ }
+
+ /* Everything is perfect :) */
+ local->call_count = 2;
+
+ for (index = 0; file_list[index] != -1; index++) {
+ /* decide about the end of the list before winding, so
+ nothing has to be looked at after the last request */
+ char need_break = (file_list[index+1] == -1);
+ STACK_WIND_COOKIE (frame,
+ unify_open_cbk,
+ priv->xl_array[file_list[index]],
+ priv->xl_array[file_list[index]],
+ priv->xl_array[file_list[index]]->fops->open,
+ &local->loc1,
+ local->flags,
+ local->fd);
+ if (need_break)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+/**
+ * unify_open_readlink_cbk - readlink() reply handler used by open().
+ *
+ * Stores the link target in local->name - as is when it is absolute,
+ * otherwise prefixed with the directory part of local->loc1.path - and then
+ * sends lookup() to xl_array[0 .. child_count]; the replies are collected
+ * by unify_open_lookup_cbk().
+ *
+ * @path: link target returned by readlink(); read only when op_ret != -1.
+ *
+ * On a failed readlink the call is unwound with ENOENT and the fd, like the
+ * other open() unwinds. Memory exhaustion aborts through ERR_ABORT. Always
+ * returns 0.
+ */
+int32_t
+unify_open_readlink_cbk (call_frame_t *frame,
+                         void *cookie,
+                         xlator_t *this,
+                         int32_t op_ret,
+                         int32_t op_errno,
+                         const char *path)
+{
+        int16_t          index = 0;
+        unify_private_t *priv = this->private;
+        unify_local_t   *local = frame->local;
+        char            *tmp_str = NULL;
+        char            *tmp_base = NULL;
+        size_t           name_size = 0;
+
+        if (op_ret == -1) {
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, -1, ENOENT, local->fd);
+                return 0;
+        }
+
+        if (path[0] == '/') {
+                local->name = strdup (path);
+                ERR_ABORT (local->name);
+        } else {
+                tmp_str = strdup (local->loc1.path);
+                ERR_ABORT (tmp_str);
+                tmp_base = dirname (tmp_str);
+
+                /* room for "<dir>/<target>" plus the terminator; never
+                   less than the ZR_PATH_MAX bytes that used to be
+                   allocated, but large enough for any target length */
+                name_size = strlen (tmp_base) + 1 + strlen (path) + 1;
+                if (name_size < (size_t) ZR_PATH_MAX)
+                        name_size = (size_t) ZR_PATH_MAX;
+
+                local->name = CALLOC (1, name_size);
+                ERR_ABORT (local->name);
+                strcpy (local->name, tmp_base);
+                strcat (local->name, "/");
+                strcat (local->name, path);
+                FREE (tmp_str);
+        }
+
+        /* slots: [0] namespace, [1] storage node, [2] spare */
+        local->list = CALLOC (1, sizeof (int16_t) * 3);
+        ERR_ABORT (local->list);
+        local->call_count = priv->child_count + 1;
+        local->op_ret = -1;
+        for (index = 0; index <= priv->child_count; index++) {
+                /* Send the lookup to all the nodes including namespace */
+                STACK_WIND_COOKIE (frame,
+                                   unify_open_lookup_cbk,
+                                   (void *)(long)index,
+                                   priv->xl_array[index],
+                                   priv->xl_array[index]->fops->lookup,
+                                   &local->loc1,
+                                   NULL);
+        }
+
+        return 0;
+}
+#endif /* GF_DARWIN_HOST_OS */
+
+/**
+ * unify_open - open a file on the namespace and on the storage node that
+ * holds it.
+ *
+ * The list of subvolume indices stored in the inode context is walked to
+ * count how many nodes have the file and to pick the storage node (the
+ * last entry that is not the namespace index, priv->child_count).
+ * If fewer than two nodes have the file the call is unwound with EIO.
+ * Otherwise open() is wound to the namespace and to the storage node and
+ * the replies are gathered by unify_open_cbk.
+ *
+ * On Darwin builds a symlink is first resolved through readlink() on the
+ * namespace (see unify_open_readlink_cbk).
+ */
+int32_t
+unify_open (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ int32_t flags,
+ fd_t *fd)
+{
+ unify_private_t *priv = this->private;
+ unify_local_t *local = NULL;
+ int16_t *list = NULL;
+ int16_t index = 0;
+ int16_t file_list[3] = {0,};
+ uint64_t tmp_list = 0;
+
+ UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+ /* Init */
+ INIT_LOCAL (frame, local);
+ loc_copy (&local->loc1, loc);
+ local->fd = fd;
+ local->flags = flags;
+ inode_ctx_get (loc->inode, this, &tmp_list); /* return value is not checked */
+ list = (int16_t *)(long)tmp_list;
+
+ local->list = list;
+ file_list[0] = priv->child_count; /* Thats namespace */
+ file_list[2] = -1;
+ for (index = 0; list[index] != -1; index++) {
+ local->call_count++;
+ if (list[index] != priv->child_count)
+ file_list[1] = list[index];
+ }
+
+ if (local->call_count != 2) {
+ /* If the lookup was done for file */
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: entry_count is %d",
+ loc->path, local->call_count);
+ for (index = 0; local->list[index] != -1; index++)
+ gf_log (this->name, GF_LOG_ERROR, "%s: found on %s",
+ loc->path, priv->xl_array[list[index]]->name);
+
+ if (local->call_count < 2) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "returning EIO as file found on onlyone node");
+ STACK_UNWIND (frame, -1, EIO, fd);
+ return 0;
+ }
+ }
+
+#ifdef GF_DARWIN_HOST_OS
+ /* Handle symlink here */
+ if (S_ISLNK (loc->inode->st_mode)) {
+ /* Callcount doesn't matter here */
+ STACK_WIND (frame,
+ unify_open_readlink_cbk,
+ NS(this),
+ NS(this)->fops->readlink,
+ loc, ZR_PATH_MAX);
+ return 0;
+ }
+#endif /* GF_DARWIN_HOST_OS */
+
+ local->call_count = 2; /* one reply from each entry of file_list */
+ for (index = 0; file_list[index] != -1; index++) {
+ char need_break = (file_list[index+1] == -1); /* computed before the wind */
+ STACK_WIND_COOKIE (frame,
+ unify_open_cbk,
+ priv->xl_array[file_list[index]], //cookie
+ priv->xl_array[file_list[index]],
+ priv->xl_array[file_list[index]]->fops->open,
+ loc,
+ flags,
+ fd);
+ if (need_break)
+ break;
+ }
+
+ return 0;
+}
+
+
+/**
+ * unify_create_unlink_cbk - reply to the unlink() wound to undo a create
+ * that did not succeed on every node. The unlink result itself is ignored;
+ * the status recorded in the local structure is what gets unwound.
+ */
+int32_t
+unify_create_unlink_cbk (call_frame_t *frame,
+                         void *cookie,
+                         xlator_t *this,
+                         int32_t op_ret,
+                         int32_t op_errno)
+{
+        unify_local_t *local = frame->local;
+        inode_t       *saved_inode = NULL;
+
+        /* Grab the inode pointer before the local structure is wiped */
+        saved_inode = local->loc1.inode;
+        unify_local_wipe (local);
+
+        STACK_UNWIND (frame,
+                      local->op_ret,
+                      local->op_errno,
+                      local->fd,
+                      saved_inode,
+                      &local->stbuf);
+
+        return 0;
+}
+
+/**
+ * unify_create_open_cbk - reply to the open() calls wound by
+ * unify_create_lookup_cbk (create on a file that already exists).
+ *
+ * The cookie is the subvolume the reply came from. A successful reply from
+ * a storage node records that node in the fd context, where the fd based
+ * operations look it up. Once both replies are in, a partial success
+ * (one open failed while another succeeded) is undone by winding unlink()
+ * to the node that succeeded, and EIO is reported to the caller through
+ * unify_create_unlink_cbk. Otherwise the result is unwound directly.
+ */
+int32_t
+unify_create_open_cbk (call_frame_t *frame,
+                       void *cookie,
+                       xlator_t *this,
+                       int32_t op_ret,
+                       int32_t op_errno,
+                       fd_t *fd)
+{
+        int            ret = 0;
+        int32_t        callcnt = 0;
+        unify_local_t *local = frame->local;
+        inode_t       *inode = NULL;
+        xlator_t      *child = NULL;
+        uint64_t       tmp_value = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret >= 0) {
+                        local->op_ret = op_ret;
+                        if (NS(this) != (xlator_t *)cookie) {
+                                /* Store child node's ptr, used in all
+                                   the f*** / FileIO calls */
+                                ret = fd_ctx_set (fd, this,
+                                                  (uint64_t)(long)cookie);
+                                if (ret != 0)
+                                        gf_log (this->name, GF_LOG_ERROR,
+                                                "path(%s): failed to store "
+                                                "child node in fd context",
+                                                local->loc1.path);
+                        } else {
+                                /* NOTE: open successful on namespace.
+                                 * fd's ctx can be used to identify open
+                                 * failure on storage subvolume. cool
+                                 * ide ;) */
+                                local->failed = 0;
+                        }
+                } else {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "child(%s): path(%s): %s",
+                                ((xlator_t *)cookie)->name,
+                                local->loc1.path, strerror (op_errno));
+                        local->op_errno = op_errno;
+                        local->failed = 1;
+                }
+                callcnt = --local->call_count;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!callcnt) {
+                if (local->failed == 1 && (local->op_ret >= 0)) {
+                        /* return -1 to user */
+                        local->op_ret = -1;
+                        local->op_errno = EIO;
+                        local->fd = fd;
+                        local->call_count = 1;
+
+                        /* A value in the fd context means the storage
+                           node opened the file. */
+                        if (!fd_ctx_get (local->fd, this, &tmp_value)) {
+                                child = (xlator_t *)(long)tmp_value;
+
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Create success on child node, "
+                                        "failed on namespace");
+
+                                STACK_WIND (frame,
+                                            unify_create_unlink_cbk,
+                                            child,
+                                            child->fops->unlink,
+                                            &local->loc1);
+                        } else {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Create success on namespace, "
+                                        "failed on child node");
+
+                                STACK_WIND (frame,
+                                            unify_create_unlink_cbk,
+                                            NS(this),
+                                            NS(this)->fops->unlink,
+                                            &local->loc1);
+                        }
+                        return 0;
+                }
+                inode = local->loc1.inode;
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, local->op_ret, local->op_errno, fd,
+                              inode, &local->stbuf);
+        }
+        return 0;
+}
+
+/**
+ * unify_create_lookup_cbk - reply to the lookups wound by
+ * unify_ns_create_cbk when the file already exists on the namespace.
+ *
+ * The cookie is the index of the replying subvolume in priv->xl_array.
+ * Under the frame lock each successful reply appends that index to
+ * local->list; the inode number is taken from the namespace reply and the
+ * rest of the stat from a storage node reply. Failures are logged and
+ * recorded in local->op_errno / local->failed.
+ *
+ * When the last reply arrives the list is terminated with -1 and stored in
+ * the inode context. If the file was found on fewer than two nodes the
+ * call is unwound with EIO; otherwise open() is wound to two nodes and the
+ * replies are handled by unify_create_open_cbk.
+ *
+ * NOTE(review): one list slot is used per successful reply plus one for
+ * the terminator - confirm the allocation made by the caller is large
+ * enough when the file is present on more than two nodes.
+ */
+int32_t
+unify_create_lookup_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ inode_t *inode,
+ struct stat *buf,
+ dict_t *dict)
+{
+ int32_t callcnt = 0;
+ int16_t index = 0;
+ unify_private_t *priv = this->private;
+ unify_local_t *local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "child(%s): path(%s): %s",
+ priv->xl_array[(long)cookie]->name,
+ local->loc1.path, strerror (op_errno));
+ local->op_errno = op_errno;
+ local->failed = 1;
+ }
+
+ if (op_ret >= 0) {
+ local->op_ret = op_ret;
+ local->list[local->index++] = (int16_t)(long)cookie;
+ if (NS(this) == priv->xl_array[(long)cookie]) {
+ local->st_ino = buf->st_ino;
+ } else {
+ local->stbuf = *buf;
+ }
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ int16_t *list = local->list;
+ int16_t file_list[3] = {0,};
+ local->op_ret = -1;
+
+ local->list [local->index] = -1; /* terminate the node list */
+ file_list[0] = list[0];
+ file_list[1] = list[1];
+ file_list[2] = -1;
+
+ local->stbuf.st_ino = local->st_ino;
+ /* TODO: log on failure */
+ inode_ctx_put (local->loc1.inode, this,
+ (uint64_t)(long)local->list);
+
+ if (local->index != 2) {
+ /* Lookup failed, can't do open */
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: present on %d nodes",
+ local->loc1.path, local->index);
+ file_list[0] = priv->child_count; /* namespace index */
+ for (index = 0; list[index] != -1; index++) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: found on %s", local->loc1.path,
+ priv->xl_array[list[index]]->name);
+ if (list[index] != priv->child_count)
+ file_list[1] = list[index];
+ }
+
+ if (local->index < 2) {
+ unify_local_wipe (local);
+ gf_log (this->name, GF_LOG_ERROR,
+ "returning EIO as file found on "
+ "only one node");
+ STACK_UNWIND (frame, -1, EIO,
+ local->fd, inode, NULL);
+ return 0;
+ }
+ }
+ /* Everything is perfect :) */
+ local->call_count = 2;
+
+ for (index = 0; file_list[index] != -1; index++) {
+ char need_break = (file_list[index+1] == -1); /* computed before the wind */
+ STACK_WIND_COOKIE (frame,
+ unify_create_open_cbk,
+ priv->xl_array[file_list[index]],
+ priv->xl_array[file_list[index]],
+ priv->xl_array[file_list[index]]->fops->open,
+ &local->loc1,
+ local->flags,
+ local->fd);
+ if (need_break)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+/**
+ * unify_create_cbk - reply from the storage node picked for the create.
+ *
+ * A failure rolls back the namespace entry by winding unlink() to the
+ * namespace. On success the stat from the storage node is kept (with the
+ * inode number recorded earlier from the namespace), the storage node is
+ * remembered in the fd context, and the result is unwound.
+ */
+int32_t
+unify_create_cbk (call_frame_t *frame,
+                  void *cookie,
+                  xlator_t *this,
+                  int32_t op_ret,
+                  int32_t op_errno,
+                  fd_t *fd,
+                  inode_t *inode,
+                  struct stat *buf)
+{
+        unify_local_t *local = frame->local;
+        call_frame_t  *prev_frame = cookie;
+        inode_t       *saved_inode = NULL;
+        int            ret = 0;
+
+        if (op_ret == -1) {
+                /* send unlink () on Namespace */
+                local->op_ret = -1;
+                local->op_errno = op_errno;
+                local->call_count = 1;
+
+                gf_log (this->name, GF_LOG_ERROR,
+                        "create failed on %s (file %s, error %s), "
+                        "sending unlink to namespace",
+                        prev_frame->this->name,
+                        local->loc1.path, strerror (op_errno));
+
+                STACK_WIND (frame, unify_create_unlink_cbk,
+                            NS(this), NS(this)->fops->unlink,
+                            &local->loc1);
+                return 0;
+        }
+
+        if (op_ret >= 0) {
+                local->stbuf = *buf;
+                /* Only the inode number comes from the namespace node */
+                local->stbuf.st_ino = local->st_ino;
+                local->op_ret = op_ret;
+
+                /* TODO: log on failure */
+                ret = fd_ctx_set (fd, this,
+                                  (uint64_t)(long)prev_frame->this);
+        }
+
+        saved_inode = local->loc1.inode;
+        unify_local_wipe (local);
+        STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                      local->fd, saved_inode, &local->stbuf);
+
+        return 0;
+}
+
+/**
+ * unify_ns_create_cbk - reply to the exclusive create wound to the
+ * namespace by unify_create.
+ *
+ * - Failure other than EEXIST, or EEXIST while the caller itself asked for
+ *   O_EXCL: unwind the error.
+ * - Success: remember the namespace inode number, start the node list in
+ *   the inode context, ask the scheduler for a storage node and wind the
+ *   create there (unify_create_cbk). If no node can be scheduled the
+ *   namespace entry is removed again and ENOTCONN is reported.
+ * - EEXIST without O_EXCL: the file already exists, so look it up on every
+ *   subvolume (unify_create_lookup_cbk) and open it instead.
+ */
+int32_t
+unify_ns_create_cbk (call_frame_t *frame,
+                     void *cookie,
+                     xlator_t *this,
+                     int32_t op_ret,
+                     int32_t op_errno,
+                     fd_t *fd,
+                     inode_t *inode,
+                     struct stat *buf)
+{
+        struct sched_ops *sched_ops = NULL;
+        xlator_t         *sched_xl = NULL;
+        unify_local_t    *local = frame->local;
+        unify_private_t  *priv = this->private;
+        int16_t          *list = NULL;
+        int16_t           index = 0;
+
+        if (op_ret == -1) {
+                /* No need to send create request to other servers, as
+                   namespace action failed. Handle exclusive create here. */
+                if ((op_errno != EEXIST) ||
+                    ((op_errno == EEXIST) &&
+                     ((local->flags & O_EXCL) == O_EXCL))) {
+                        /* If its just a create call without O_EXCL,
+                           don't do this */
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "namespace: path(%s): %s",
+                                local->loc1.path, strerror (op_errno));
+                        unify_local_wipe (local);
+                        STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
+                        return 0;
+                }
+        }
+
+        if (op_ret >= 0) {
+                /* Get the inode number from the NS node */
+                local->st_ino = buf->st_ino;
+
+                local->op_ret = -1;
+
+                /* Start the mapping list: namespace, storage node, -1 */
+                list = CALLOC (1, sizeof (int16_t) * 3);
+                ERR_ABORT (list);
+                inode_ctx_put (inode, this, (uint64_t)(long)list);
+                list[0] = priv->child_count;
+                list[2] = -1;
+
+                /* This means, file doesn't exist anywhere in the Filesystem */
+                sched_ops = priv->sched_ops;
+
+                /* Send create request to the scheduled node now */
+                sched_xl = sched_ops->schedule (this, local->loc1.path);
+                if (sched_xl == NULL)
+                {
+                        /* send unlink () on Namespace */
+                        local->op_errno = ENOTCONN;
+                        local->op_ret = -1;
+                        local->call_count = 1;
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "no node online to schedule create:(file %s) "
+                                "sending unlink to namespace",
+                                (local->loc1.path)?local->loc1.path:"");
+
+                        STACK_WIND (frame,
+                                    unify_create_unlink_cbk,
+                                    NS(this),
+                                    NS(this)->fops->unlink,
+                                    &local->loc1);
+
+                        return 0;
+                }
+
+                for (index = 0; index < priv->child_count; index++)
+                        if (sched_xl == priv->xl_array[index])
+                                break;
+                list[1] = index;
+
+                STACK_WIND (frame, unify_create_cbk,
+                            sched_xl, sched_xl->fops->create,
+                            &local->loc1, local->flags, local->mode, fd);
+        } else {
+                /* File already exists, and there is no O_EXCL flag */
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "File(%s) already exists on namespace, sending "
+                        "open instead", local->loc1.path);
+
+                /* The lookup callback appends one entry per node that
+                   has the file and then a -1 terminator. Lookups go to
+                   child_count + 1 nodes, so reserve child_count + 2
+                   slots; a fixed size of 3 overflows when the file is
+                   present on more than two nodes. */
+                local->list = CALLOC (priv->child_count + 2,
+                                      sizeof (int16_t));
+                ERR_ABORT (local->list);
+                local->call_count = priv->child_count + 1;
+                local->op_ret = -1;
+                for (index = 0; index <= priv->child_count; index++) {
+                        /* Send lookup() to all nodes including namespace */
+                        STACK_WIND_COOKIE (frame,
+                                           unify_create_lookup_cbk,
+                                           (void *)(long)index,
+                                           priv->xl_array[index],
+                                           priv->xl_array[index]->fops->lookup,
+                                           &local->loc1,
+                                           NULL);
+                }
+        }
+        return 0;
+}
+
+/**
+ * unify_create - create the file on the namespace first (always with
+ * O_EXCL added), so other clients can see it. The reply is handled by
+ * unify_ns_create_cbk, which continues on the storage nodes.
+ */
+int32_t
+unify_create (call_frame_t *frame,
+              xlator_t *this,
+              loc_t *loc,
+              int32_t flags,
+              mode_t mode,
+              fd_t *fd)
+{
+        unify_local_t *local = NULL;
+
+        /* Set up the per-call state */
+        INIT_LOCAL (frame, local);
+        loc_copy (&local->loc1, loc);
+        local->fd = fd;
+        local->flags = flags;
+        local->mode = mode;
+
+        if (!local->loc1.path) {
+                gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
+                STACK_UNWIND (frame, -1, ENOMEM, fd, loc->inode, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, unify_ns_create_cbk,
+                    NS(this), NS(this)->fops->create,
+                    loc, flags | O_EXCL, mode, fd);
+
+        return 0;
+}
+
+
+/**
+ * unify_opendir_cbk - callback for the opendir wound by unify_opendir.
+ * Passes op_ret, op_errno and fd through to the caller unchanged.
+ */
+int32_t
+unify_opendir_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ fd_t *fd)
+{
+ STACK_UNWIND (frame, op_ret, op_errno, fd);
+
+ return 0;
+}
+
+/**
+ * unify_opendir - open a directory. The request is wound to the namespace
+ * node only; the reply is relayed by unify_opendir_cbk.
+ */
+int32_t
+unify_opendir (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ fd_t *fd)
+{
+ UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+ STACK_WIND (frame, unify_opendir_cbk,
+ NS(this), NS(this)->fops->opendir, loc, fd);
+
+ return 0;
+}
+
+
+/**
+ * unify_chmod - change the mode of a path.
+ *
+ * A directory is updated on every subvolume including the namespace.
+ * Any other file is updated on each node listed in its inode context.
+ * Replies are gathered by unify_buf_cbk.
+ */
+int32_t
+unify_chmod (call_frame_t *frame,
+             xlator_t *this,
+             loc_t *loc,
+             mode_t mode)
+{
+        unify_local_t   *local = NULL;
+        unify_private_t *priv = this->private;
+        int32_t          pos = 0;
+        int32_t          remaining = 0;
+        uint64_t         ctx_value = 0;
+
+        UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        /* Set up the per-call state */
+        INIT_LOCAL (frame, local);
+
+        loc_copy (&local->loc1, loc);
+        local->st_ino = loc->inode->ino;
+
+        if (S_ISDIR (loc->inode->st_mode)) {
+                local->call_count = priv->child_count + 1;
+
+                for (pos = 0; pos <= priv->child_count; pos++) {
+                        STACK_WIND (frame, unify_buf_cbk,
+                                    priv->xl_array[pos],
+                                    priv->xl_array[pos]->fops->chmod,
+                                    loc, mode);
+                }
+                return 0;
+        }
+
+        inode_ctx_get (loc->inode, this, &ctx_value);
+        local->list = (int16_t *)(long)ctx_value;
+
+        /* Count the nodes first so call_count is final before any wind */
+        for (pos = 0; local->list[pos] != -1; pos++) {
+                local->call_count++;
+                remaining++;
+        }
+
+        pos = 0;
+        while (local->list[pos] != -1) {
+                STACK_WIND (frame, unify_buf_cbk,
+                            priv->xl_array[local->list[pos]],
+                            priv->xl_array[local->list[pos]]->fops->chmod,
+                            loc, mode);
+                /* Stop right after the last wind */
+                if (--remaining == 0)
+                        break;
+                pos++;
+        }
+
+        return 0;
+}
+
+/**
+ * unify_chown - change the owner and group of a path.
+ *
+ * A directory is updated on every subvolume including the namespace.
+ * Any other file is updated on each node listed in its inode context.
+ * Replies are gathered by unify_buf_cbk.
+ */
+int32_t
+unify_chown (call_frame_t *frame,
+             xlator_t *this,
+             loc_t *loc,
+             uid_t uid,
+             gid_t gid)
+{
+        unify_local_t   *local = NULL;
+        unify_private_t *priv = this->private;
+        int32_t          pos = 0;
+        int32_t          remaining = 0;
+        uint64_t         ctx_value = 0;
+
+        UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        /* Set up the per-call state */
+        INIT_LOCAL (frame, local);
+        loc_copy (&local->loc1, loc);
+        local->st_ino = loc->inode->ino;
+
+        if (S_ISDIR (loc->inode->st_mode)) {
+                local->call_count = priv->child_count + 1;
+
+                for (pos = 0; pos <= priv->child_count; pos++) {
+                        STACK_WIND (frame, unify_buf_cbk,
+                                    priv->xl_array[pos],
+                                    priv->xl_array[pos]->fops->chown,
+                                    loc, uid, gid);
+                }
+                return 0;
+        }
+
+        inode_ctx_get (loc->inode, this, &ctx_value);
+        local->list = (int16_t *)(long)ctx_value;
+
+        /* Count the nodes first so call_count is final before any wind */
+        for (pos = 0; local->list[pos] != -1; pos++) {
+                local->call_count++;
+                remaining++;
+        }
+
+        pos = 0;
+        while (local->list[pos] != -1) {
+                STACK_WIND (frame, unify_buf_cbk,
+                            priv->xl_array[local->list[pos]],
+                            priv->xl_array[local->list[pos]]->fops->chown,
+                            loc, uid, gid);
+                /* Stop right after the last wind */
+                if (--remaining == 0)
+                        break;
+                pos++;
+        }
+
+        return 0;
+}
+
+
+/**
+ * unify_truncate_cbk - gathers the replies wound by unify_truncate and
+ * unify_ftruncate.
+ *
+ * Under the frame lock: a failure is logged and recorded in
+ * local->op_errno; local->op_ret becomes -1 unless the error is ENOENT and
+ * priv->optimist is set. On success the inode number is taken from the
+ * namespace reply, while the stat of a non-directory is taken from the
+ * storage node reply.
+ *
+ * When the last reply has arrived the call is unwound with the merged
+ * stat; if no inode number was obtained the call is failed.
+ */
+int32_t
+unify_truncate_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct stat *buf)
+{
+ int32_t callcnt = 0;
+ unify_private_t *priv = this->private;
+ unify_local_t *local = frame->local;
+ call_frame_t *prev_frame = cookie;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "child(%s): path(%s): %s",
+ prev_frame->this->name,
+ (local->loc1.path)?local->loc1.path:"",
+ strerror (op_errno));
+ local->op_errno = op_errno;
+ if (!((op_errno == ENOENT) && priv->optimist))
+ local->op_ret = -1;
+ }
+
+ if (op_ret >= 0) {
+ if (NS (this) == prev_frame->this) {
+ local->st_ino = buf->st_ino;
+ /* If the entry is directory, get the
+ stat from NS node */
+ if (S_ISDIR (buf->st_mode) ||
+ !local->stbuf.st_blksize) {
+ local->stbuf = *buf;
+ }
+ }
+
+ if ((!S_ISDIR (buf->st_mode)) &&
+ (NS (this) != prev_frame->this)) {
+ /* If file, take the stat info from
+ Storage node. */
+ local->stbuf = *buf;
+ }
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ if (local->st_ino)
+ local->stbuf.st_ino = local->st_ino;
+ else
+ local->op_ret = -1; /* no inode number available */
+ unify_local_wipe (local);
+ STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ &local->stbuf);
+ }
+
+ return 0;
+}
+
+/**
+ * unify_truncate - truncate a path to the given offset.
+ *
+ * For a directory only a stat is wound to the namespace (reply handled by
+ * unify_buf_cbk). For any other file a stat is wound to the namespace and
+ * truncate is wound to every non-namespace node listed in the inode
+ * context; those replies are merged by unify_truncate_cbk.
+ */
+int32_t
+unify_truncate (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ off_t offset)
+{
+ unify_local_t *local = NULL;
+ unify_private_t *priv = this->private;
+ int32_t index = 0;
+ int32_t callcnt = 0;
+ uint64_t tmp_list = 0;
+
+ UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+ /* Initialization */
+ INIT_LOCAL (frame, local);
+ loc_copy (&local->loc1, loc);
+ local->st_ino = loc->inode->ino;
+
+ if (S_ISDIR (loc->inode->st_mode)) {
+ local->call_count = 1;
+
+ STACK_WIND (frame,
+ unify_buf_cbk,
+ NS(this),
+ NS(this)->fops->stat,
+ loc);
+ } else {
+ local->op_ret = 0;
+ inode_ctx_get (loc->inode, this, &tmp_list);
+ local->list = (int16_t *)(long)tmp_list;
+
+ for (index = 0; local->list[index] != -1; index++) {
+ local->call_count++;
+ callcnt++;
+ }
+
+ /* Don't send truncate to NS node */
+ STACK_WIND (frame, unify_truncate_cbk, NS(this),
+ NS(this)->fops->stat, loc);
+ callcnt--; /* the namespace stat accounts for one expected reply */
+
+ for (index = 0; local->list[index] != -1; index++) {
+ if (NS(this) != priv->xl_array[local->list[index]]) {
+ STACK_WIND (frame,
+ unify_truncate_cbk,
+ priv->xl_array[local->list[index]],
+ priv->xl_array[local->list[index]]->fops->truncate,
+ loc,
+ offset);
+ if (!--callcnt)
+ break;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * unify_utimens - set the access and modification times of a path.
+ *
+ * A directory is updated on every subvolume including the namespace.
+ * Any other file is updated on each node listed in its inode context.
+ * Replies are gathered by unify_buf_cbk.
+ */
+int32_t
+unify_utimens (call_frame_t *frame,
+               xlator_t *this,
+               loc_t *loc,
+               struct timespec tv[2])
+{
+        unify_local_t   *local = NULL;
+        unify_private_t *priv = this->private;
+        int32_t          pos = 0;
+        int32_t          remaining = 0;
+        uint64_t         ctx_value = 0;
+
+        UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        /* Set up the per-call state */
+        INIT_LOCAL (frame, local);
+        loc_copy (&local->loc1, loc);
+        local->st_ino = loc->inode->ino;
+
+        if (S_ISDIR (loc->inode->st_mode)) {
+                local->call_count = priv->child_count + 1;
+
+                for (pos = 0; pos <= priv->child_count; pos++) {
+                        STACK_WIND (frame, unify_buf_cbk,
+                                    priv->xl_array[pos],
+                                    priv->xl_array[pos]->fops->utimens,
+                                    loc, tv);
+                }
+                return 0;
+        }
+
+        inode_ctx_get (loc->inode, this, &ctx_value);
+        local->list = (int16_t *)(long)ctx_value;
+
+        /* Count the nodes first so call_count is final before any wind */
+        for (pos = 0; local->list[pos] != -1; pos++) {
+                local->call_count++;
+                remaining++;
+        }
+
+        pos = 0;
+        while (local->list[pos] != -1) {
+                STACK_WIND (frame, unify_buf_cbk,
+                            priv->xl_array[local->list[pos]],
+                            priv->xl_array[local->list[pos]]->fops->utimens,
+                            loc, tv);
+                /* Stop right after the last wind */
+                if (--remaining == 0)
+                        break;
+                pos++;
+        }
+
+        return 0;
+}
+
+/**
+ * unify_readlink_cbk - callback for the readlink wound by unify_readlink.
+ * Passes op_ret, op_errno and the link target through unchanged.
+ */
+int32_t
+unify_readlink_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ const char *path)
+{
+ STACK_UNWIND (frame, op_ret, op_errno, path);
+ return 0;
+}
+
+/**
+ * unify_readlink - read the link from a storage node only.
+ *
+ * The inode context lists the nodes that hold the link. If fewer than two
+ * nodes are listed the call is unwound with ENOENT; otherwise readlink is
+ * wound to the first listed node that is not the namespace.
+ */
+int32_t
+unify_readlink (call_frame_t *frame,
+                xlator_t *this,
+                loc_t *loc,
+                size_t size)
+{
+        unify_private_t *priv = this->private;
+        int32_t          found = 0;
+        int16_t         *nodes = NULL;
+        int16_t          pos = 0;
+        uint64_t         ctx_value = 0;
+
+        UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        inode_ctx_get (loc->inode, this, &ctx_value);
+        nodes = (int16_t *)(long)ctx_value;
+
+        for (pos = 0; nodes[pos] != -1; pos++)
+                found++;
+
+        if (found < 2) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "returning ENOENT, no softlink files found "
+                        "on storage node");
+                STACK_UNWIND (frame, -1, ENOENT, NULL);
+                return 0;
+        }
+
+        for (pos = 0; nodes[pos] != -1; pos++) {
+                /* Skip the namespace entry */
+                if (priv->xl_array[nodes[pos]] == NS(this))
+                        continue;
+
+                STACK_WIND (frame, unify_readlink_cbk,
+                            priv->xl_array[nodes[pos]],
+                            priv->xl_array[nodes[pos]]->fops->readlink,
+                            loc, size);
+                break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * unify_unlink_cbk - gathers the unlink replies wound by unify_unlink.
+ *
+ * Under the frame lock: local->op_ret is set to 0 when a node succeeded,
+ * or when the error is ENOENT and priv->optimist is set; the errno of a
+ * failed reply is kept in local->op_errno. The call is unwound once the
+ * last reply has arrived.
+ */
+int32_t
+unify_unlink_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno)
+{
+ int32_t callcnt = 0;
+ unify_private_t *priv = this->private;
+ unify_local_t *local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+ if (op_ret == 0 || ((op_errno == ENOENT) && priv->optimist))
+ local->op_ret = 0;
+ if (op_ret == -1)
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ unify_local_wipe (local);
+ STACK_UNWIND (frame, local->op_ret, local->op_errno);
+ }
+
+ return 0;
+}
+
+
+/**
+ * unify_unlink - remove a file from every node listed in its inode
+ * context. If the list is empty the call is unwound with ENOENT; the
+ * replies are otherwise gathered by unify_unlink_cbk.
+ */
+int32_t
+unify_unlink (call_frame_t *frame,
+              xlator_t *this,
+              loc_t *loc)
+{
+        unify_private_t *priv = this->private;
+        unify_local_t   *local = NULL;
+        int16_t         *nodes = NULL;
+        int16_t          pos = 0;
+        uint64_t         ctx_value = 0;
+
+        UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        /* Set up the per-call state */
+        INIT_LOCAL (frame, local);
+        loc_copy (&local->loc1, loc);
+
+        inode_ctx_get (loc->inode, this, &ctx_value);
+        nodes = (int16_t *)(long)ctx_value;
+
+        for (pos = 0; nodes[pos] != -1; pos++)
+                local->call_count++;
+
+        if (!local->call_count) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "%s: returning ENOENT", loc->path);
+                STACK_UNWIND (frame, -1, ENOENT);
+                return 0;
+        }
+
+        for (pos = 0; nodes[pos] != -1; pos++) {
+                /* Decide before the wind whether this is the last node */
+                char is_last = (nodes[pos + 1] == -1);
+
+                STACK_WIND (frame, unify_unlink_cbk,
+                            priv->xl_array[nodes[pos]],
+                            priv->xl_array[nodes[pos]]->fops->unlink,
+                            loc);
+                if (is_last)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * unify_readv_cbk - callback for the readv wound by unify_readv.
+ * Passes the result, the data vector, its count and the stat through
+ * unchanged.
+ */
+int32_t
+unify_readv_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iovec *vector,
+ int32_t count,
+ struct stat *stbuf)
+{
+ STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
+ return 0;
+}
+
+/**
+ * unify_readv - read from an open file. The request goes to the
+ * subvolume stored in the fd context.
+ */
+int32_t
+unify_readv (call_frame_t *frame,
+             xlator_t *this,
+             fd_t *fd,
+             size_t size,
+             off_t offset)
+{
+        uint64_t  ctx_value = 0;
+        xlator_t *target = NULL;
+
+        UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
+
+        fd_ctx_get (fd, this, &ctx_value);
+        target = (xlator_t *)(long)ctx_value;
+
+        STACK_WIND (frame, unify_readv_cbk,
+                    target, target->fops->readv,
+                    fd, size, offset);
+
+        return 0;
+}
+
+/**
+ * unify_writev_cbk - callback for the writev wound by unify_writev.
+ * Passes op_ret, op_errno and the stat through unchanged.
+ */
+int32_t
+unify_writev_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct stat *stbuf)
+{
+ STACK_UNWIND (frame, op_ret, op_errno, stbuf);
+ return 0;
+}
+
+/**
+ * unify_writev - write to an open file. The request goes to the
+ * subvolume stored in the fd context.
+ */
+int32_t
+unify_writev (call_frame_t *frame,
+              xlator_t *this,
+              fd_t *fd,
+              struct iovec *vector,
+              int32_t count,
+              off_t off)
+{
+        uint64_t  ctx_value = 0;
+        xlator_t *target = NULL;
+
+        UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
+
+        fd_ctx_get (fd, this, &ctx_value);
+        target = (xlator_t *)(long)ctx_value;
+
+        STACK_WIND (frame, unify_writev_cbk,
+                    target, target->fops->writev,
+                    fd, vector, count, off);
+
+        return 0;
+}
+
+/**
+ * unify_ftruncate - truncate an open file. ftruncate is wound to the
+ * subvolume stored in the fd context and fstat to the namespace; both
+ * replies are merged by unify_truncate_cbk.
+ */
+int32_t
+unify_ftruncate (call_frame_t *frame,
+                 xlator_t *this,
+                 fd_t *fd,
+                 off_t offset)
+{
+        unify_local_t *local = NULL;
+        xlator_t      *target = NULL;
+        uint64_t       ctx_value = 0;
+
+        UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(fd);
+
+        /* Set up the per-call state */
+        INIT_LOCAL (frame, local);
+        local->op_ret = 0;
+
+        fd_ctx_get (fd, this, &ctx_value);
+        target = (xlator_t *)(long)ctx_value;
+
+        /* Two replies are expected: storage node and namespace */
+        local->call_count = 2;
+
+        STACK_WIND (frame, unify_truncate_cbk,
+                    target, target->fops->ftruncate, fd, offset);
+
+        STACK_WIND (frame, unify_truncate_cbk,
+                    NS(this), NS(this)->fops->fstat, fd);
+
+        return 0;
+}
+
+
+/**
+ * unify_fchmod - change the mode through an open fd.
+ *
+ * When the fd context holds a subvolume the fd refers to a file and the
+ * call is wound to that subvolume and to the namespace. Otherwise it is
+ * treated as a directory and wound to the namespace only. Replies are
+ * gathered by unify_buf_cbk.
+ */
+int32_t
+unify_fchmod (call_frame_t *frame,
+              xlator_t *this,
+              fd_t *fd,
+              mode_t mode)
+{
+        unify_local_t *local = NULL;
+        xlator_t      *target = NULL;
+        uint64_t       ctx_value = 0;
+
+        UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd);
+
+        /* Set up the per-call state */
+        INIT_LOCAL (frame, local);
+        local->st_ino = fd->inode->ino;
+
+        if (fd_ctx_get (fd, this, &ctx_value) != 0) {
+                /* nothing stored in the fd context: this is a directory */
+                local->call_count = 1;
+
+                STACK_WIND (frame, unify_buf_cbk,
+                            NS(this), NS(this)->fops->fchmod, fd, mode);
+                return 0;
+        }
+
+        /* a stored subvolume means this is a file */
+        target = (xlator_t *)(long)ctx_value;
+        local->call_count = 2;
+
+        STACK_WIND (frame, unify_buf_cbk,
+                    target, target->fops->fchmod, fd, mode);
+
+        STACK_WIND (frame, unify_buf_cbk,
+                    NS(this), NS(this)->fops->fchmod, fd, mode);
+
+        return 0;
+}
+
+/**
+ * unify_fchown - change owner and group through an open fd.
+ *
+ * When the fd context holds a subvolume the fd refers to a file and the
+ * call is wound to that subvolume and to the namespace. Otherwise it is
+ * wound to the namespace only. Replies are gathered by unify_buf_cbk.
+ */
+int32_t
+unify_fchown (call_frame_t *frame,
+              xlator_t *this,
+              fd_t *fd,
+              uid_t uid,
+              gid_t gid)
+{
+        unify_local_t *local = NULL;
+        xlator_t      *target = NULL;
+        uint64_t       ctx_value = 0;
+
+        UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd);
+
+        /* Set up the per-call state */
+        INIT_LOCAL (frame, local);
+        local->st_ino = fd->inode->ino;
+
+        if (fd_ctx_get (fd, this, &ctx_value) != 0) {
+                /* nothing stored in the fd context: namespace only */
+                local->call_count = 1;
+
+                STACK_WIND (frame, unify_buf_cbk,
+                            NS(this), NS(this)->fops->fchown,
+                            fd, uid, gid);
+                return 0;
+        }
+
+        /* a stored subvolume means this is a file */
+        target = (xlator_t *)(long)ctx_value;
+        local->call_count = 2;
+
+        STACK_WIND (frame, unify_buf_cbk,
+                    target, target->fops->fchown, fd, uid, gid);
+
+        STACK_WIND (frame, unify_buf_cbk,
+                    NS(this), NS(this)->fops->fchown, fd, uid, gid);
+
+        return 0;
+}
+
+/**
+ * unify_flush_cbk - callback for the flush wound by unify_flush.
+ * Passes op_ret and op_errno through unchanged.
+ */
+int32_t
+unify_flush_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno)
+{
+ STACK_UNWIND (frame, op_ret, op_errno);
+ return 0;
+}
+
+/**
+ * unify_flush - flush an open file. The request goes to the subvolume
+ * stored in the fd context.
+ */
+int32_t
+unify_flush (call_frame_t *frame,
+             xlator_t *this,
+             fd_t *fd)
+{
+        uint64_t  ctx_value = 0;
+        xlator_t *target = NULL;
+
+        UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
+
+        fd_ctx_get (fd, this, &ctx_value);
+        target = (xlator_t *)(long)ctx_value;
+
+        STACK_WIND (frame, unify_flush_cbk,
+                    target, target->fops->flush, fd);
+
+        return 0;
+}
+
+
+/**
+ * unify_fsync_cbk - callback for the fsync wound by unify_fsync.
+ * Passes op_ret and op_errno through unchanged.
+ */
+int32_t
+unify_fsync_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno)
+{
+ STACK_UNWIND (frame, op_ret, op_errno);
+ return 0;
+}
+
+/**
+ * unify_fsync - sync an open file. The request goes to the subvolume
+ * stored in the fd context.
+ */
+int32_t
+unify_fsync (call_frame_t *frame,
+             xlator_t *this,
+             fd_t *fd,
+             int32_t flags)
+{
+        uint64_t  ctx_value = 0;
+        xlator_t *target = NULL;
+
+        UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
+
+        fd_ctx_get (fd, this, &ctx_value);
+        target = (xlator_t *)(long)ctx_value;
+
+        STACK_WIND (frame, unify_fsync_cbk,
+                    target, target->fops->fsync, fd, flags);
+
+        return 0;
+}
+
+/**
+ * unify_fstat - Send fstat FOP to Namespace only if its directory, and to
+ * both namespace and the storage node if its a file. Whether the fd is a
+ * file is decided by the presence of a subvolume in the fd context.
+ * Replies are gathered by unify_buf_cbk.
+ */
+int32_t
+unify_fstat (call_frame_t *frame,
+             xlator_t *this,
+             fd_t *fd)
+{
+        unify_local_t *local = NULL;
+        xlator_t      *target = NULL;
+        uint64_t       ctx_value = 0;
+
+        UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd);
+
+        INIT_LOCAL (frame, local);
+        local->st_ino = fd->inode->ino;
+
+        if (fd_ctx_get (fd, this, &ctx_value) != 0) {
+                /* nothing stored in the fd context: this is a directory */
+                local->call_count = 1;
+                STACK_WIND (frame, unify_buf_cbk,
+                            NS(this), NS(this)->fops->fstat, fd);
+                return 0;
+        }
+
+        /* a stored subvolume means this is a file */
+        target = (xlator_t *)(long)ctx_value;
+        local->call_count = 2;
+
+        STACK_WIND (frame, unify_buf_cbk,
+                    target, target->fops->fstat, fd);
+
+        STACK_WIND (frame, unify_buf_cbk,
+                    NS(this), NS(this)->fops->fstat, fd);
+
+        return 0;
+}
+
+/**
+ * unify_getdents_cbk - callback for the getdents wound by unify_getdents.
+ * Passes the result, the entry list and its count through unchanged.
+ */
+int32_t
+unify_getdents_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dir_entry_t *entry,
+ int32_t count)
+{
+ STACK_UNWIND (frame, op_ret, op_errno, entry, count);
+ return 0;
+}
+
+/**
+ * unify_getdents - wind the getdents request to the namespace node only;
+ * the reply is relayed by unify_getdents_cbk.
+ */
+int32_t
+unify_getdents (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ size_t size,
+ off_t offset,
+ int32_t flag)
+{
+ UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd);
+
+ STACK_WIND (frame, unify_getdents_cbk, NS(this),
+ NS(this)->fops->getdents, fd, size, offset, flag);
+
+ return 0;
+}
+
+
+/**
+ * unify_readdir_cbk - callback for the readdir wound by unify_readdir.
+ * Passes op_ret, op_errno and the dirent buffer through unchanged.
+ */
+int32_t
+unify_readdir_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ gf_dirent_t *buf)
+{
+ STACK_UNWIND (frame, op_ret, op_errno, buf);
+
+ return 0;
+}
+
+/**
+ * unify_readdir - wind the readdir request to the namespace node only;
+ * the reply is relayed by unify_readdir_cbk.
+ */
+int32_t
+unify_readdir (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ size_t size,
+ off_t offset)
+{
+ UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd);
+
+ STACK_WIND (frame, unify_readdir_cbk, NS(this),
+ NS(this)->fops->readdir, fd, size, offset);
+
+ return 0;
+}
+
+
+/**
+ * unify_fsyncdir_cbk - callback for the fsyncdir wound by unify_fsyncdir.
+ * Passes op_ret and op_errno through unchanged.
+ */
+int32_t
+unify_fsyncdir_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno)
+{
+ STACK_UNWIND (frame, op_ret, op_errno);
+
+ return 0;
+}
+
+/**
+ * unify_fsyncdir - wind the fsyncdir request to the namespace node only;
+ * the reply is relayed by unify_fsyncdir_cbk.
+ */
+int32_t
+unify_fsyncdir (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ int32_t flags)
+{
+ UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd);
+
+ STACK_WIND (frame, unify_fsyncdir_cbk,
+ NS(this), NS(this)->fops->fsyncdir, fd, flags);
+
+ return 0;
+}
+
+/**
+ * unify_lk_cbk - UNWIND frame with the proper return arguments: op_ret,
+ * op_errno and the lock structure are passed through unchanged.
+ */
+int32_t
+unify_lk_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct flock *lock)
+{
+ STACK_UNWIND (frame, op_ret, op_errno, lock);
+ return 0;
+}
+
+/**
+ * unify_lk - forward a lock request to the single subvolume stored in
+ * the fd context; the reply is relayed by unify_lk_cbk.
+ */
+int32_t
+unify_lk (call_frame_t *frame,
+          xlator_t *this,
+          fd_t *fd,
+          int32_t cmd,
+          struct flock *lock)
+{
+        uint64_t  ctx_value = 0;
+        xlator_t *target = NULL;
+
+        UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
+
+        fd_ctx_get (fd, this, &ctx_value);
+        target = (xlator_t *)(long)ctx_value;
+
+        STACK_WIND (frame, unify_lk_cbk,
+                    target, target->fops->lk, fd, cmd, lock);
+
+        return 0;
+}
+
+
+/* Defined further down; needed here because unify_setxattr_file_cbk winds
+   to it. */
+int32_t
+unify_setxattr_cbk (call_frame_t *frame,
+                    void *cookie,
+                    xlator_t *this,
+                    int32_t op_ret,
+                    int32_t op_errno);
+
+/**
+ * unify_setxattr_file_cbk - reply to the XATTR_CREATE setxattr that
+ * unify_setxattr_cbk winds to the namespace.
+ *
+ * On failure the error is unwound (and logged unless it is ENOTSUP).
+ * On success the scheduler picks a storage node for local->name and the
+ * original setxattr (local->dict, local->flags) is wound to it, with
+ * unify_setxattr_cbk handling the reply. If no node can be scheduled the
+ * call is unwound with ENOTCONN.
+ */
+static int32_t
+unify_setxattr_file_cbk (call_frame_t *frame,
+                         void *cookie,
+                         xlator_t *this,
+                         int32_t op_ret,
+                         int32_t op_errno)
+{
+        unify_private_t  *private = this->private;
+        unify_local_t    *local = frame->local;
+        xlator_t         *sched_xl = NULL;
+        struct sched_ops *sched_ops = NULL;
+
+        if (op_ret == -1) {
+                /* The previous test, !ENOTSUP, was a constant and always
+                   false, so this failure was never logged. */
+                if (op_errno != ENOTSUP)
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "setxattr with XATTR_CREATE on ns: "
+                                "path(%s) key(%s): %s",
+                                local->loc1.path, local->name,
+                                strerror (op_errno));
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, op_ret, op_errno);
+                return 0;
+        }
+
+        LOCK (&frame->lock);
+        {
+                local->failed = 0;
+                local->op_ret = 0;
+                local->op_errno = 0;
+                local->call_count = 1;
+        }
+        UNLOCK (&frame->lock);
+
+        /* schedule XATTR_CREATE on one of the child node */
+        sched_ops = private->sched_ops;
+
+        /* Send create request to the scheduled node now */
+        sched_xl = sched_ops->schedule (this, local->name);
+        if (!sched_xl) {
+                /* Wipe before unwinding, as the failure path above does */
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, -1, ENOTCONN);
+                return 0;
+        }
+
+        STACK_WIND (frame,
+                    unify_setxattr_cbk,
+                    sched_xl,
+                    sched_xl->fops->setxattr,
+                    &local->loc1,
+                    local->dict,
+                    local->flags);
+        return 0;
+}
+
+/**
+ * unify_setxattr_cbk - When all the child nodes return, UNWIND frame.
+ *
+ * Under the frame lock each reply decrements local->call_count. A failure
+ * is logged (at debug level for ENOENT/ENOTSUP), recorded in
+ * local->op_errno, and turns local->failed from -1 into 1; a success
+ * clears local->failed and records op_ret.
+ *
+ * After the last reply, if the operation is still marked failed and
+ * local->name is a file-content request key, a setxattr with XATTR_CREATE
+ * carrying that key with an empty value is wound to the namespace and
+ * handled by unify_setxattr_file_cbk. Otherwise the result is unwound.
+ */
+int32_t
+unify_setxattr_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno)
+{
+ int32_t callcnt = 0;
+ unify_local_t *local = frame->local;
+ call_frame_t *prev_frame = cookie;
+ dict_t *dict = NULL;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (op_ret == -1) {
+ gf_log (this->name, (((op_errno == ENOENT) ||
+ (op_errno == ENOTSUP))?
+ GF_LOG_DEBUG : GF_LOG_ERROR),
+ "child(%s): path(%s): %s",
+ prev_frame->this->name,
+ (local->loc1.path)?local->loc1.path:"",
+ strerror (op_errno));
+ if (local->failed == -1) {
+ local->failed = 1;
+ }
+ local->op_errno = op_errno;
+ } else {
+ local->failed = 0;
+ local->op_ret = op_ret;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ if (local->failed && local->name &&
+ ZR_FILE_CONTENT_REQUEST(local->name)) {
+ dict = get_new_dict ();
+ dict_set (dict, local->dict->members_list->key,
+ data_from_dynptr(NULL, 0));
+ dict_ref (dict);
+
+ local->call_count = 1;
+
+ STACK_WIND (frame,
+ unify_setxattr_file_cbk,
+ NS(this),
+ NS(this)->fops->setxattr,
+ &local->loc1,
+ dict,
+ XATTR_CREATE);
+
+ dict_unref (dict); /* drop the reference taken above */
+ return 0;
+ }
+
+ unify_local_wipe (local);
+ STACK_UNWIND (frame, local->op_ret, local->op_errno);
+ }
+
+ return 0;
+}
+
+/**
+ * unify_setxattr - Fan a setxattr request out to the storage nodes.
+ *
+ * For a directory the request is wound to every entry of priv->xl_array
+ * in [0, child_count). For any other inode it is wound to every node named
+ * in the inode's context list except the namespace. If that list names no
+ * storage node the frame is unwound with -1/ENOENT.
+ *
+ * Directory case only: when the first key of @dict is a
+ * ZR_FILE_CONTENT_REQUEST key, the caller's flags, dict and a copy of the
+ * key are remembered in local and XATTR_REPLACE is OR-ed into the flags
+ * sent down, so unify_setxattr_cbk can fall back to a create.
+ *
+ * NOTE(review): local->dict stores @dict without taking a reference here;
+ * confirm the caller keeps it alive until the callbacks have run.
+ */
+int32_t
+unify_setxattr (call_frame_t *frame,
+                xlator_t *this,
+                loc_t *loc,
+                dict_t *dict,
+                int32_t flags)
+{
+        unify_private_t *priv = this->private;
+        unify_local_t *local = NULL;
+        int16_t *list = NULL;
+        int16_t index = 0;
+        int32_t call_count = 0;
+        uint64_t tmp_list = 0;
+        data_pair_t *trav = dict->members_list;
+
+        UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        /* Initialization */
+        INIT_LOCAL (frame, local);
+        local->failed = -1;
+        loc_copy (&local->loc1, loc);
+
+        if (S_ISDIR (loc->inode->st_mode)) {
+
+                if (trav && trav->key && ZR_FILE_CONTENT_REQUEST(trav->key)) {
+                        /* direct the storage xlators to change file
+                           content only if file exists */
+                        local->flags = flags;
+                        local->dict = dict;
+                        local->name = strdup (trav->key);
+                        flags |= XATTR_REPLACE;
+                }
+
+                /* call_count must be final before the first wind */
+                local->call_count = priv->child_count;
+                for (index = 0; index < priv->child_count; index++) {
+                        STACK_WIND (frame,
+                                    unify_setxattr_cbk,
+                                    priv->xl_array[index],
+                                    priv->xl_array[index]->fops->setxattr,
+                                    loc, dict, flags);
+                }
+                return 0;
+        }
+
+        inode_ctx_get (loc->inode, this, &tmp_list);
+        list = (int16_t *)(long)tmp_list;
+
+        /* Count the storage nodes first so call_count is final before
+         * any callback can run */
+        for (index = 0; list[index] != -1; index++) {
+                if (NS(this) != priv->xl_array[list[index]]) {
+                        local->call_count++;
+                        call_count++;
+                }
+        }
+
+        if (local->call_count) {
+                for (index = 0; list[index] != -1; index++) {
+                        if (priv->xl_array[list[index]] != NS(this)) {
+                                STACK_WIND (frame,
+                                            unify_setxattr_cbk,
+                                            priv->xl_array[list[index]],
+                                            priv->xl_array[list[index]]->fops->setxattr,
+                                            loc,
+                                            dict,
+                                            flags);
+                                /* stop touching state once the last
+                                 * request has been sent */
+                                if (!--call_count)
+                                        break;
+                        }
+                }
+                return 0;
+        }
+
+        /* No entry in storage nodes */
+        gf_log (this->name, GF_LOG_DEBUG,
+                "returning ENOENT, file not found on storage node.");
+        /* release what loc_copy() put into local->loc1 above, as every
+         * other terminal path of this fop does before unwinding */
+        unify_local_wipe (local);
+        STACK_UNWIND (frame, -1, ENOENT);
+
+        return 0;
+}
+
+
+/**
+ * unify_getxattr_cbk - Collects the getxattr replies from every node the
+ * request was wound to. Each reply decrements local->call_count under
+ * frame->lock. A failure is recorded in local->op_errno and logged (DEBUG
+ * for ENOENT/ENODATA/ENOTSUP, ERROR otherwise); a success stores op_ret and,
+ * if local->dict is still unset, keeps a reference to the returned dict.
+ * When the last reply arrives the frame is unwound with local->op_ret,
+ * local->op_errno and the kept dict, and that reference is then dropped.
+ */
+int32_t
+unify_getxattr_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *value)
+{
+ int32_t callcnt = 0;
+ dict_t *local_value = NULL;
+ unify_local_t *local = frame->local;
+ call_frame_t *prev_frame = cookie;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ gf_log (this->name,
+ (((op_errno == ENOENT) ||
+ (op_errno == ENODATA) ||
+ (op_errno == ENOTSUP)) ?
+ GF_LOG_DEBUG : GF_LOG_ERROR),
+ "child(%s): path(%s): %s",
+ prev_frame->this->name,
+ (local->loc1.path)?local->loc1.path:"",
+ strerror (op_errno));
+ } else {
+ if (!local->dict)
+ local->dict = dict_ref (value);
+ local->op_ret = op_ret;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ local_value = local->dict;
+ local->dict = NULL;
+
+ STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ local_value);
+
+ if (local_value)
+ dict_unref (local_value);
+ }
+
+ return 0;
+}
+
+
+/**
+ * unify_getxattr - Wind getxattr to the storage nodes only.
+ *
+ * A directory is queried on every child in priv->xl_array. Any other inode
+ * is queried on each node of its context list that is not the namespace.
+ * When the list names no storage node the frame is unwound with
+ * -1/ENODATA and an empty dict.
+ */
+int32_t
+unify_getxattr (call_frame_t *frame,
+                xlator_t *this,
+                loc_t *loc,
+                const char *name)
+{
+        unify_private_t *priv      = this->private;
+        unify_local_t   *local     = NULL;
+        int16_t         *node_list = NULL;
+        int16_t          pos       = 0;
+        int16_t          pending   = 0;
+        uint64_t         ctx_value = 0;
+        dict_t          *empty     = NULL;
+
+        UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+        INIT_LOCAL (frame, local);
+
+        if (S_ISDIR (loc->inode->st_mode)) {
+                /* every child is asked; replies are counted in the cbk */
+                local->call_count = priv->child_count;
+                pos = 0;
+                while (pos < priv->child_count) {
+                        STACK_WIND (frame,
+                                    unify_getxattr_cbk,
+                                    priv->xl_array[pos],
+                                    priv->xl_array[pos]->fops->getxattr,
+                                    loc,
+                                    name);
+                        pos++;
+                }
+                return 0;
+        }
+
+        inode_ctx_get (loc->inode, this, &ctx_value);
+        node_list = (int16_t *)(long)ctx_value;
+
+        /* first pass: how many storage nodes hold this inode */
+        for (pos = 0; node_list[pos] != -1; pos++) {
+                if (priv->xl_array[node_list[pos]] == NS(this))
+                        continue;
+                local->call_count++;
+                pending++;
+        }
+
+        if (!pending) {
+                empty = get_new_dict ();
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "%s: returning ENODATA, no file found on storage node",
+                        loc->path);
+                STACK_UNWIND (frame, -1, ENODATA, empty);
+                dict_destroy (empty);
+                return 0;
+        }
+
+        /* second pass: send the requests, stopping right after the last */
+        for (pos = 0; pending && node_list[pos] != -1; pos++) {
+                if (priv->xl_array[node_list[pos]] == NS(this))
+                        continue;
+
+                STACK_WIND (frame,
+                            unify_getxattr_cbk,
+                            priv->xl_array[node_list[pos]],
+                            priv->xl_array[node_list[pos]]->fops->getxattr,
+                            loc,
+                            name);
+                pending--;
+        }
+
+        return 0;
+}
+
+/**
+ * unify_removexattr_cbk - Wait till all the child nodes return the call
+ * and then UNWIND to the layer above.
+ *
+ * Each reply decrements local->call_count under frame->lock. A failure is
+ * stored in local->op_errno and, unless it is ENOTSUP, logged at ERROR
+ * level; a success stores op_ret. The last reply unwinds the frame with
+ * local->op_ret / local->op_errno.
+ */
+int32_t
+unify_removexattr_cbk (call_frame_t *frame,
+                       void *cookie,
+                       xlator_t *this,
+                       int32_t op_ret,
+                       int32_t op_errno)
+{
+        int32_t callcnt = 0;
+        unify_local_t *local = frame->local;
+        call_frame_t *prev_frame = cookie;
+
+        LOCK (&frame->lock);
+        {
+                callcnt = --local->call_count;
+                if (op_ret == -1) {
+                        local->op_errno = op_errno;
+                        if (op_errno != ENOTSUP)
+                                /* unify_removexattr() does not copy the loc
+                                 * into local, so the path may be NULL; guard
+                                 * it like the set/getxattr callbacks do */
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "child(%s): path(%s): %s",
+                                        prev_frame->this->name,
+                                        (local->loc1.path)?local->loc1.path:"",
+                                        strerror (op_errno));
+                } else {
+                        local->op_ret = op_ret;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (!callcnt) {
+                STACK_UNWIND (frame, local->op_ret, local->op_errno);
+        }
+
+        return 0;
+}
+
+/**
+ * unify_removexattr - Wind removexattr to every node that holds the entry.
+ *
+ * Directories go to all children of priv->xl_array; other inodes go to the
+ * nodes of their context list, namespace excluded. With no storage node in
+ * the list the frame is unwound with -1/ENOENT.
+ */
+int32_t
+unify_removexattr (call_frame_t *frame,
+                   xlator_t *this,
+                   loc_t *loc,
+                   const char *name)
+{
+        unify_private_t *priv      = this->private;
+        unify_local_t   *local     = NULL;
+        int16_t         *node_list = NULL;
+        int16_t          pos       = 0;
+        int32_t          pending   = 0;
+        uint64_t         ctx_value = 0;
+
+        UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
+
+        /* Set up the per-call state */
+        INIT_LOCAL (frame, local);
+
+        if (S_ISDIR (loc->inode->st_mode)) {
+                local->call_count = priv->child_count;
+                pos = 0;
+                while (pos < priv->child_count) {
+                        STACK_WIND (frame,
+                                    unify_removexattr_cbk,
+                                    priv->xl_array[pos],
+                                    priv->xl_array[pos]->fops->removexattr,
+                                    loc,
+                                    name);
+                        pos++;
+                }
+
+                return 0;
+        }
+
+        inode_ctx_get (loc->inode, this, &ctx_value);
+        node_list = (int16_t *)(long)ctx_value;
+
+        /* count the storage nodes before anything is sent */
+        for (pos = 0; node_list[pos] != -1; pos++) {
+                if (priv->xl_array[node_list[pos]] == NS(this))
+                        continue;
+                local->call_count++;
+                pending++;
+        }
+
+        if (!local->call_count) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "%s: returning ENOENT, not found on storage node.", loc->path);
+                STACK_UNWIND (frame, -1, ENOENT);
+
+                return 0;
+        }
+
+        for (pos = 0; pending && node_list[pos] != -1; pos++) {
+                if (priv->xl_array[node_list[pos]] == NS(this))
+                        continue;
+
+                STACK_WIND (frame,
+                            unify_removexattr_cbk,
+                            priv->xl_array[node_list[pos]],
+                            priv->xl_array[node_list[pos]]->fops->removexattr,
+                            loc,
+                            name);
+                pending--;
+        }
+
+        return 0;
+}
+
+
+/**
+ * unify_mknod_unlink_cbk - Reply to the namespace unlink that is sent when
+ * the storage-node half of a mknod could not be done. Always unwinds the
+ * mknod as failed, using the errno saved in local->op_errno.
+ */
+int32_t
+unify_mknod_unlink_cbk (call_frame_t *frame,
+                        void *cookie,
+                        xlator_t *this,
+                        int32_t op_ret,
+                        int32_t op_errno)
+{
+        unify_local_t *local = frame->local;
+
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "%s: %s", local->loc1.path, strerror (op_errno));
+        }
+
+        unify_local_wipe (local);
+
+        /* the -1 below reports the mknod failure, not this unlink */
+        STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL);
+        return 0;
+}
+
+/**
+ * unify_mknod_cbk - Reply from the storage node chosen for a mknod.
+ * Success is unwound with the returned stat, its st_ino replaced by the one
+ * saved in local->st_ino. Failure saves op_errno and sends an unlink to the
+ * namespace (finished in unify_mknod_unlink_cbk).
+ */
+int32_t
+unify_mknod_cbk (call_frame_t *frame,
+                 void *cookie,
+                 xlator_t *this,
+                 int32_t op_ret,
+                 int32_t op_errno,
+                 inode_t *inode,
+                 struct stat *buf)
+{
+        unify_local_t *local = frame->local;
+
+        if (op_ret != -1) {
+                local->stbuf = *buf;
+                local->stbuf.st_ino = local->st_ino;
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "mknod failed on storage node, sending unlink to "
+                "namespace");
+        local->op_errno = op_errno;
+
+        STACK_WIND (frame,
+                    unify_mknod_unlink_cbk,
+                    NS(this),
+                    NS(this)->fops->unlink,
+                    &local->loc1);
+        return 0;
+}
+
+/**
+ * unify_ns_mknod_cbk - Reply from the namespace for a mknod.
+ *
+ * On failure the error is unwound as is. On success a three-slot node list
+ * { namespace index, scheduled child index, -1 } is attached to the inode
+ * context and the mknod is wound to the child picked by the scheduler. If
+ * the scheduler returns no child, the namespace entry is unlinked again
+ * with ENOTCONN saved as the error to report.
+ */
+int32_t
+unify_ns_mknod_cbk (call_frame_t *frame,
+                    void *cookie,
+                    xlator_t *this,
+                    int32_t op_ret,
+                    int32_t op_errno,
+                    inode_t *inode,
+                    struct stat *buf)
+{
+        unify_private_t *priv       = this->private;
+        unify_local_t   *local      = frame->local;
+        call_frame_t    *prev_frame = cookie;
+        xlator_t        *target     = NULL;
+        int16_t         *node_list  = NULL;
+        int16_t          pos        = 0;
+
+        if (op_ret == -1) {
+                /* namespace refused: nothing to do on the storage side */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "child(%s): path(%s): %s",
+                        prev_frame->this->name, local->loc1.path,
+                        strerror (op_errno));
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
+                return 0;
+        }
+
+        /* remember what the namespace reported */
+        local->op_ret = 0;
+        local->stbuf  = *buf;
+        local->st_ino = buf->st_ino;
+
+        node_list = CALLOC (1, sizeof (int16_t) * 3);
+        ERR_ABORT (node_list);
+        node_list[0] = priv->child_count;
+        node_list[2] = -1;
+        inode_ctx_put (inode, this, (uint64_t)(long)node_list);
+
+        /* ask the scheduler which child gets the node */
+        target = priv->sched_ops->schedule (this, local->loc1.path);
+        if (target == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "mknod failed on storage node, no node online "
+                        "at the moment, sending unlink to NS");
+                local->op_errno = ENOTCONN;
+                STACK_WIND (frame,
+                            unify_mknod_unlink_cbk,
+                            NS(this),
+                            NS(this)->fops->unlink,
+                            &local->loc1);
+
+                return 0;
+        }
+
+        /* slot 1 holds the position of the scheduled child */
+        pos = 0;
+        while (pos < priv->child_count && target != priv->xl_array[pos])
+                pos++;
+        node_list[1] = pos;
+
+        STACK_WIND (frame, unify_mknod_cbk,
+                    target, target->fops->mknod,
+                    &local->loc1, local->mode, local->dev);
+
+        return 0;
+}
+
+/**
+ * unify_mknod - Create a device on namespace first, and later create on
+ * the storage node.
+ *
+ * mode, rdev and a copy of @loc are kept in local for the callbacks. If the
+ * copied loc has no path the call is unwound with -1/ENOMEM.
+ */
+int32_t
+unify_mknod (call_frame_t *frame,
+             xlator_t *this,
+             loc_t *loc,
+             mode_t mode,
+             dev_t rdev)
+{
+        unify_local_t *local = NULL;
+
+        /* Initialization */
+        INIT_LOCAL (frame, local);
+        local->mode = mode;
+        local->dev = rdev;
+        loc_copy (&local->loc1, loc);
+        if (local->loc1.path == NULL) {
+                gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
+                /* drop whatever part of the loc did get copied, as the
+                 * other terminal paths of this fop do before unwinding */
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame,
+                    unify_ns_mknod_cbk,
+                    NS(this),
+                    NS(this)->fops->mknod,
+                    loc,
+                    mode,
+                    rdev);
+
+        return 0;
+}
+
+/**
+ * unify_symlink_unlink_cbk - Reply to the namespace unlink that is sent
+ * when the storage-node half of a symlink could not be done. Always unwinds
+ * the symlink as failed, using the errno saved in local->op_errno.
+ */
+int32_t
+unify_symlink_unlink_cbk (call_frame_t *frame,
+                          void *cookie,
+                          xlator_t *this,
+                          int32_t op_ret,
+                          int32_t op_errno)
+{
+        unify_local_t *local = frame->local;
+
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "%s: %s", local->loc1.path, strerror (op_errno));
+        }
+
+        unify_local_wipe (local);
+        STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL);
+        return 0;
+}
+
+/**
+ * unify_symlink_cbk - Reply from the storage node chosen for a symlink.
+ * Success is unwound with the returned stat, its st_ino replaced by the one
+ * saved in local->st_ino. Failure saves op_errno and sends an unlink to the
+ * namespace (finished in unify_symlink_unlink_cbk).
+ */
+int32_t
+unify_symlink_cbk (call_frame_t *frame,
+                   void *cookie,
+                   xlator_t *this,
+                   int32_t op_ret,
+                   int32_t op_errno,
+                   inode_t *inode,
+                   struct stat *buf)
+{
+        unify_local_t *local = frame->local;
+
+        if (op_ret != -1) {
+                local->stbuf = *buf;
+                local->stbuf.st_ino = local->st_ino;
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf);
+
+                return 0;
+        }
+
+        /* storage node refused: take the entry off the namespace again */
+        local->op_errno = op_errno;
+        gf_log (this->name, GF_LOG_ERROR,
+                "symlink on storage node failed, sending unlink "
+                "to namespace");
+
+        STACK_WIND (frame,
+                    unify_symlink_unlink_cbk,
+                    NS(this),
+                    NS(this)->fops->unlink,
+                    &local->loc1);
+
+        return 0;
+}
+
+/**
+ * unify_ns_symlink_cbk - Reply from the namespace for a symlink.
+ *
+ * On failure the error is unwound (with a NULL inode). On success a
+ * three-slot node list { namespace index, scheduled child index, -1 } is
+ * attached to the inode context and the symlink is wound to the single
+ * child picked by the scheduler. If the scheduler returns no child, the
+ * namespace entry is unlinked again with ENOTCONN saved as the error.
+ */
+int32_t
+unify_ns_symlink_cbk (call_frame_t *frame,
+                      void *cookie,
+                      xlator_t *this,
+                      int32_t op_ret,
+                      int32_t op_errno,
+                      inode_t *inode,
+                      struct stat *buf)
+{
+        unify_private_t *priv      = this->private;
+        unify_local_t   *local     = frame->local;
+        xlator_t        *target    = NULL;
+        int16_t         *node_list = NULL;
+        int16_t          pos       = 0;
+
+        if (op_ret == -1) {
+                /* namespace refused: nothing to do on the storage side */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "namespace: path(%s): %s",
+                        local->loc1.path, strerror (op_errno));
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, op_ret, op_errno, NULL, buf);
+                return 0;
+        }
+
+        /* remember the inode number the namespace assigned */
+        local->op_ret = 0;
+        local->st_ino = buf->st_ino;
+
+        /* start the node mapping list; slot 0 is the namespace's index */
+        node_list = CALLOC (1, sizeof (int16_t) * 3);
+        ERR_ABORT (node_list);
+        node_list[0] = priv->child_count;
+        node_list[2] = -1;
+        inode_ctx_put (inode, this, (uint64_t)(long)node_list);
+
+        /* ask the scheduler which child gets the link */
+        target = priv->sched_ops->schedule (this, local->loc1.path);
+        if (target == NULL) {
+                /* no child available: undo the namespace entry */
+                local->op_errno = ENOTCONN;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "symlink on storage node failed, no node online, "
+                        "sending unlink to namespace");
+
+                STACK_WIND (frame,
+                            unify_symlink_unlink_cbk,
+                            NS(this),
+                            NS(this)->fops->unlink,
+                            &local->loc1);
+
+                return 0;
+        }
+
+        /* slot 1 holds the position of the scheduled child */
+        pos = 0;
+        while (pos < priv->child_count && target != priv->xl_array[pos])
+                pos++;
+        node_list[1] = pos;
+
+        STACK_WIND (frame,
+                    unify_symlink_cbk,
+                    target,
+                    target->fops->symlink,
+                    local->name,
+                    &local->loc1);
+
+        return 0;
+}
+
+/**
+ * unify_symlink - Create the symlink on the namespace first; the storage
+ * node is handled in unify_ns_symlink_cbk.
+ *
+ * A copy of @loc and a strdup of @linkpath are kept in local for the
+ * callbacks. If either copy is missing the call is unwound with
+ * -1/ENOMEM.
+ */
+int32_t
+unify_symlink (call_frame_t *frame,
+               xlator_t *this,
+               const char *linkpath,
+               loc_t *loc)
+{
+        unify_local_t *local = NULL;
+
+        /* Initialization */
+        INIT_LOCAL (frame, local);
+        loc_copy (&local->loc1, loc);
+        local->name = strdup (linkpath);
+
+        if ((local->name == NULL) ||
+            (local->loc1.path == NULL)) {
+                gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
+                /* one of the two copies may have succeeded; release it
+                 * before giving up, as the callbacks do before unwinding */
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame,
+                    unify_ns_symlink_cbk,
+                    NS(this),
+                    NS(this)->fops->symlink,
+                    linkpath,
+                    loc);
+
+        return 0;
+}
+
+
+/**
+ * unify_rename_unlink_cbk - Reply to the unlinks sent by unify_rename_cbk.
+ * Failures are only logged. Once the last reply is in, the rename is
+ * unwound with the result already stored in local and the stat whose
+ * st_ino is taken from local->st_ino.
+ */
+int32_t
+unify_rename_unlink_cbk (call_frame_t *frame,
+                         void *cookie,
+                         xlator_t *this,
+                         int32_t op_ret,
+                         int32_t op_errno)
+{
+        unify_local_t *local      = frame->local;
+        call_frame_t  *prev_frame = cookie;
+        int32_t        remaining  = 0;
+
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "child(%s): path(%s -> %s): %s",
+                        prev_frame->this->name,
+                        local->loc1.path, local->loc2.path,
+                        strerror (op_errno));
+
+        }
+
+        LOCK (&frame->lock);
+        {
+                remaining = --local->call_count;
+        }
+        UNLOCK (&frame->lock);
+
+        if (remaining)
+                return 0;
+
+        local->stbuf.st_ino = local->st_ino;
+        unify_local_wipe (local);
+        STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                      &local->stbuf);
+        return 0;
+}
+
+/**
+ * unify_ns_rename_undo_cbk - Reply to the namespace rename that reverts a
+ * rename which failed on the storage node. A failure of the revert is only
+ * logged; the frame is unwound with the result already stored in local.
+ */
+int32_t
+unify_ns_rename_undo_cbk (call_frame_t *frame,
+                          void *cookie,
+                          xlator_t *this,
+                          int32_t op_ret,
+                          int32_t op_errno,
+                          struct stat *buf)
+{
+        unify_local_t *local = frame->local;
+
+        if (op_ret == -1)
+                gf_log (this->name, GF_LOG_ERROR,
+                        "namespace: path(%s -> %s): %s",
+                        local->loc1.path, local->loc2.path,
+                        strerror (op_errno));
+
+        local->stbuf.st_ino = local->st_ino;
+        unify_local_wipe (local);
+
+        STACK_UNWIND (frame, local->op_ret, local->op_errno, &local->stbuf);
+        return 0;
+}
+
+/**
+ * unify_rename_cbk - Collects the rename replies from the storage nodes.
+ *
+ * Each reply decrements local->call_count under frame->lock; a success
+ * stores op_ret (and the stat, unless it is a directory), a failure is
+ * logged and stored in local->op_errno. After the last reply:
+ *  - a directory rename is unwound as is;
+ *  - if no node succeeded and local->index is 0, the namespace rename is
+ *    reverted (finished in unify_ns_rename_undo_cbk);
+ *  - if it succeeded and the destination inode has a node list, an unlink
+ *    of the destination is sent to every storage node of that list which
+ *    is not also in the source's list (finished in
+ *    unify_rename_unlink_cbk);
+ *  - otherwise the frame is unwound with the stored result.
+ */
+int32_t
+unify_rename_cbk (call_frame_t *frame,
+                  void *cookie,
+                  xlator_t *this,
+                  int32_t op_ret,
+                  int32_t op_errno,
+                  struct stat *buf)
+{
+        int32_t index = 0;
+        int32_t callcnt = 0;
+        int16_t *list = NULL;
+        unify_private_t *priv = this->private;
+        unify_local_t *local = frame->local;
+        call_frame_t *prev_frame = cookie;
+
+        LOCK (&frame->lock);
+        {
+                callcnt = --local->call_count;
+                if (op_ret >= 0) {
+                        if (!S_ISDIR (buf->st_mode))
+                                local->stbuf = *buf;
+                        local->op_ret = op_ret;
+                } else {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "child(%s): path(%s -> %s): %s",
+                                prev_frame->this->name,
+                                local->loc1.path, local->loc2.path,
+                                strerror (op_errno));
+                        local->op_errno = op_errno;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (!callcnt) {
+                local->stbuf.st_ino = local->st_ino;
+                if (S_ISDIR (local->loc1.inode->st_mode)) {
+                        unify_local_wipe (local);
+                        STACK_UNWIND (frame, local->op_ret, local->op_errno, &local->stbuf);
+                        return 0;
+                }
+
+                if (local->op_ret == -1) {
+                        /* TODO: check this logic */
+
+                        /* Rename failed in storage node, successful on NS,
+                         * hence, rename back the entries in NS */
+                        /* NOTE: this will be done only if the destination
+                         * doesn't exists, if the destination exists, the
+                         * job of correcting NS is left to self-heal
+                         */
+                        if (!local->index) {
+                                loc_t tmp_oldloc = {
+                                        /* its actual 'newloc->path' */
+                                        .path = local->loc2.path,
+                                        .inode = local->loc1.inode,
+                                        .parent = local->loc2.parent
+                                };
+
+                                loc_t tmp_newloc = {
+                                        /* Actual 'oldloc->path' */
+                                        .path = local->loc1.path,
+                                        .parent = local->loc1.parent
+                                };
+
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "rename succussful on namespace, on "
+                                        "stroage node failed, reverting back");
+
+                                STACK_WIND (frame,
+                                            unify_ns_rename_undo_cbk,
+                                            NS(this),
+                                            NS(this)->fops->rename,
+                                            &tmp_oldloc,
+                                            &tmp_newloc);
+                                return 0;
+                        }
+                } else {
+                        /* Rename successful on storage nodes */
+
+                        int32_t idx = 0;
+                        int16_t *tmp_list = NULL;
+                        uint64_t tmp_list_int64 = 0;
+                        if (local->loc2.inode) {
+                                inode_ctx_get (local->loc2.inode,
+                                               this, &tmp_list_int64);
+                                list = (int16_t *)(long)tmp_list_int64;
+
+                        }
+
+                        if (list) {
+                                /* Work on a private copy of the destination's
+                                 * node list. The copy includes the -1
+                                 * terminator because the unlink loop below
+                                 * tests for it, and the allocation is checked
+                                 * before memcpy() writes through it. */
+                                for (index = 0; list[index] != -1; index++);
+                                tmp_list = CALLOC (1, (index + 1) * sizeof (int16_t));
+                                ERR_ABORT (tmp_list);
+                                memcpy (tmp_list, list,
+                                        (index + 1) * sizeof (int16_t));
+
+                                for (index = 0; list[index] != -1; index++) {
+                                        /* TODO: Check this logic. */
+                                        /* If the destination file exists in
+                                         * the same storage node where we sent
+                                         * 'rename' call, no need to send
+                                         * unlink
+                                         */
+                                        for (idx = 0;
+                                             local->list[idx] != -1; idx++) {
+                                                if (tmp_list[index] == local->list[idx]) {
+                                                        /* redirect the slot to
+                                                         * the namespace index
+                                                         * so it is skipped */
+                                                        tmp_list[index] = priv->child_count;
+                                                }
+                                        }
+
+                                        if (NS(this) != priv->xl_array[tmp_list[index]]) {
+                                                local->call_count++;
+                                                callcnt++;
+                                        }
+                                }
+
+                                if (local->call_count) {
+                                        if (callcnt > 1)
+                                                gf_log (this->name,
+                                                        GF_LOG_ERROR,
+                                                        "%s->%s: more (%d) "
+                                                        "subvolumes have the "
+                                                        "newloc entry",
+                                                        local->loc1.path,
+                                                        local->loc2.path,
+                                                        callcnt);
+
+                                        for (index=0;
+                                             tmp_list[index] != -1; index++) {
+                                                if (NS(this) != priv->xl_array[tmp_list[index]]) {
+                                                        STACK_WIND (frame,
+                                                                    unify_rename_unlink_cbk,
+                                                                    priv->xl_array[tmp_list[index]],
+                                                                    priv->xl_array[tmp_list[index]]->fops->unlink,
+                                                                    &local->loc2);
+                                                        /* stop touching local
+                                                         * after the last wind */
+                                                        if (!--callcnt)
+                                                                break;
+                                                }
+                                        }
+
+                                        FREE (tmp_list);
+                                        return 0;
+                                }
+                                FREE (tmp_list);
+                        }
+                }
+
+                /* Need not send 'unlink' to storage node */
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, local->op_ret,
+                              local->op_errno, &local->stbuf);
+        }
+
+        return 0;
+}
+
+/**
+ * unify_ns_rename_cbk - Reply from the namespace for a rename.
+ *
+ * A namespace failure is unwound as is. Otherwise the returned stat and
+ * inode number are saved and the rename is wound to all children (for a
+ * directory) or to the storage nodes named in local->list (for anything
+ * else). If local->list names no storage node the frame is unwound with
+ * -1/EIO.
+ */
+int32_t
+unify_ns_rename_cbk (call_frame_t *frame,
+                     void *cookie,
+                     xlator_t *this,
+                     int32_t op_ret,
+                     int32_t op_errno,
+                     struct stat *buf)
+{
+        unify_private_t *priv      = this->private;
+        unify_local_t   *local     = frame->local;
+        int16_t         *node_list = NULL;
+        int32_t          pos       = 0;
+        int32_t          pending   = 0;
+
+        if (op_ret == -1) {
+                /* Free local->new_inode */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "namespace: path(%s -> %s): %s",
+                        local->loc1.path, local->loc2.path,
+                        strerror (op_errno));
+
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, op_ret, op_errno, buf);
+                return 0;
+        }
+
+        local->stbuf = *buf;
+        local->st_ino = buf->st_ino;
+
+        if (S_ISDIR (buf->st_mode)) {
+                /* directories live on every child */
+                local->call_count = priv->child_count;
+                pos = 0;
+                while (pos < priv->child_count) {
+                        STACK_WIND (frame,
+                                    unify_rename_cbk,
+                                    priv->xl_array[pos],
+                                    priv->xl_array[pos]->fops->rename,
+                                    &local->loc1,
+                                    &local->loc2);
+                        pos++;
+                }
+
+                return 0;
+        }
+
+        /* count the storage nodes that hold the source */
+        local->call_count = 0;
+        node_list = local->list;
+        for (pos = 0; node_list[pos] != -1; pos++) {
+                if (priv->xl_array[node_list[pos]] == NS(this))
+                        continue;
+                local->call_count++;
+                pending++;
+        }
+
+        if (!local->call_count) {
+                /* file doesn't seem to be present in storage nodes */
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "CRITICAL: source file not in storage node, "
+                        "rename successful on namespace :O");
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, -1, EIO, NULL);
+                return 0;
+        }
+
+        /* send rename, stopping right after the last request */
+        for (pos = 0; pending && node_list[pos] != -1; pos++) {
+                if (priv->xl_array[node_list[pos]] == NS(this))
+                        continue;
+
+                STACK_WIND (frame,
+                            unify_rename_cbk,
+                            priv->xl_array[node_list[pos]],
+                            priv->xl_array[node_list[pos]]->fops->rename,
+                            &local->loc1,
+                            &local->loc2);
+                pending--;
+        }
+
+        return 0;
+}
+
+
+/**
+ * unify_rename - One of the tricky function. The deadliest of all :O
+ *
+ * Copies both locs into local, remembers the source inode's node list in
+ * local->list and renames on the namespace first; the storage nodes are
+ * handled in unify_ns_rename_cbk. If either loc copy has no path the call
+ * is unwound with -1/ENOMEM.
+ */
+int32_t
+unify_rename (call_frame_t *frame,
+              xlator_t *this,
+              loc_t *oldloc,
+              loc_t *newloc)
+{
+        unify_local_t *local = NULL;
+        uint64_t tmp_list = 0;
+
+        /* Initialization */
+        INIT_LOCAL (frame, local);
+        loc_copy (&local->loc1, oldloc);
+        loc_copy (&local->loc2, newloc);
+
+        if ((local->loc1.path == NULL) ||
+            (local->loc2.path == NULL)) {
+                gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
+                /* one of the two copies may have succeeded; release it
+                 * before giving up, as the callbacks do before unwinding */
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, -1, ENOMEM, NULL);
+                return 0;
+        }
+
+        inode_ctx_get (oldloc->inode, this, &tmp_list);
+        local->list = (int16_t *)(long)tmp_list;
+
+        STACK_WIND (frame,
+                    unify_ns_rename_cbk,
+                    NS(this),
+                    NS(this)->fops->rename,
+                    oldloc,
+                    newloc);
+        return 0;
+}
+
+/**
+ * unify_link_cbk - Reply from a storage node for a link. Unwinds with the
+ * node's result; the stat passed up is local->stbuf (refreshed from @buf on
+ * success) with st_ino taken from local->st_ino.
+ */
+int32_t
+unify_link_cbk (call_frame_t *frame,
+                void *cookie,
+                xlator_t *this,
+                int32_t op_ret,
+                int32_t op_errno,
+                inode_t *inode,
+                struct stat *buf)
+{
+        unify_local_t *local = frame->local;
+
+        if (op_ret >= 0) {
+                local->stbuf = *buf;
+        }
+        /* the inode number always comes from the namespace reply */
+        local->stbuf.st_ino = local->st_ino;
+
+        unify_local_wipe (local);
+        STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf);
+
+        return 0;
+}
+
+/**
+ * unify_ns_link_cbk - Reply from the namespace for a link.
+ *
+ * A namespace failure is unwound as is. Otherwise the namespace inode
+ * number is saved and the link is wound to the storage nodes named in
+ * local->list. If the list names no storage node the frame is unwound with
+ * -1/ENOENT instead of being left without any reply.
+ *
+ * NOTE(review): unify_link_cbk unwinds on every reply and no call_count is
+ * kept here, so a list with more than one storage node would unwind the
+ * frame more than once - confirm that cannot happen for linkable inodes.
+ * NOTE(review): the ENOENT path does not remove the entry just created on
+ * the namespace.
+ */
+int32_t
+unify_ns_link_cbk (call_frame_t *frame,
+                   void *cookie,
+                   xlator_t *this,
+                   int32_t op_ret,
+                   int32_t op_errno,
+                   inode_t *inode,
+                   struct stat *buf)
+{
+        unify_private_t *priv = this->private;
+        unify_local_t *local = frame->local;
+        int16_t *list = local->list;
+        int16_t index = 0;
+        int16_t storage_nodes = 0;
+
+        if (op_ret == -1) {
+                /* No need to send link request to other servers,
+                 * as namespace action failed
+                 */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "namespace: path(%s -> %s): %s",
+                        local->loc1.path, local->loc2.path,
+                        strerror (op_errno));
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
+                return 0;
+        }
+
+        /* Update inode for this entry */
+        local->op_ret = 0;
+        local->st_ino = buf->st_ino;
+
+        /* Make sure at least one request will be sent; otherwise nothing
+         * would ever unwind this frame */
+        for (index = 0; list[index] != -1; index++) {
+                if (priv->xl_array[list[index]] != NS (this))
+                        storage_nodes++;
+        }
+
+        if (!storage_nodes) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "%s: returning ENOENT, not found on storage node.",
+                        local->loc1.path);
+                unify_local_wipe (local);
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        /* Send link request to the node now */
+        for (index = 0; list[index] != -1; index++) {
+                /* decided before the wind: the list is not read again
+                 * once the last request has been sent */
+                char need_break = (list[index+1] == -1);
+                if (priv->xl_array[list[index]] != NS (this)) {
+                        STACK_WIND (frame,
+                                    unify_link_cbk,
+                                    priv->xl_array[list[index]],
+                                    priv->xl_array[list[index]]->fops->link,
+                                    &local->loc1,
+                                    &local->loc2);
+                }
+                if (need_break)
+                        break;
+        }
+
+        return 0;
+}
+
+/**
+ * unify_link - Create the hard link on the namespace first; the storage
+ * nodes are handled in unify_ns_link_cbk. Both locs are copied into local
+ * and the source inode's node list is remembered in local->list.
+ */
+int32_t
+unify_link (call_frame_t *frame,
+            xlator_t *this,
+            loc_t *oldloc,
+            loc_t *newloc)
+{
+        unify_local_t *local     = NULL;
+        uint64_t       ctx_value = 0;
+
+        UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc);
+        UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (newloc);
+
+        /* Set up the per-call state */
+        INIT_LOCAL (frame, local);
+
+        loc_copy (&local->loc1, oldloc);
+        loc_copy (&local->loc2, newloc);
+
+        /* the node list travels in the inode context as an integer */
+        inode_ctx_get (oldloc->inode, this, &ctx_value);
+        local->list = (int16_t *)(long)ctx_value;
+
+        STACK_WIND (frame,
+                    unify_ns_link_cbk,
+                    NS(this),
+                    NS(this)->fops->link,
+                    oldloc,
+                    newloc);
+
+        return 0;
+}
+
+
+/**
+ * unify_checksum_cbk - Hands the namespace's checksum reply straight to
+ * the layer above, unchanged.
+ */
+int32_t
+unify_checksum_cbk (call_frame_t *frame,
+                    void *cookie,
+                    xlator_t *this,
+                    int32_t op_ret,
+                    int32_t op_errno,
+                    uint8_t *fchecksum,
+                    uint8_t *dchecksum)
+{
+        STACK_UNWIND (frame, op_ret, op_errno,
+                      fchecksum, dchecksum);
+
+        return 0;
+}
+
+/**
+ * unify_checksum - The checksum request is answered by the namespace
+ * alone; it is wound there with the arguments unchanged.
+ */
+int32_t
+unify_checksum (call_frame_t *frame,
+                xlator_t *this,
+                loc_t *loc,
+                int32_t flag)
+{
+        STACK_WIND (frame, unify_checksum_cbk,
+                    NS(this), NS(this)->fops->checksum,
+                    loc, flag);
+
+        return 0;
+}
+
+
+/**
+ * unify_finodelk_cbk - Passes the child's finodelk result to the layer
+ * above, unchanged.
+ */
+int
+unify_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno)
+{
+        STACK_UNWIND (frame,
+                      op_ret,
+                      op_errno);
+        return 0;
+}
+
+/**
+ * unify_finodelk - Winds finodelk to the child stored in the fd context.
+ */
+int
+unify_finodelk (call_frame_t *frame, xlator_t *this,
+                fd_t *fd, int cmd, struct flock *flock)
+{
+        xlator_t *subvol    = NULL;
+        uint64_t  ctx_value = 0;
+
+        UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
+
+        /* the owning child travels in the fd context as an integer */
+        fd_ctx_get (fd, this, &ctx_value);
+        subvol = (xlator_t *)(long)ctx_value;
+
+        STACK_WIND (frame, unify_finodelk_cbk,
+                    subvol, subvol->fops->finodelk,
+                    fd, cmd, flock);
+
+        return 0;
+}
+
+
+
+/**
+ * unify_fentrylk_cbk - Passes the child's fentrylk result to the layer
+ * above, unchanged.
+ */
+int
+unify_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno)
+{
+        STACK_UNWIND (frame,
+                      op_ret,
+                      op_errno);
+        return 0;
+}
+
+/**
+ * unify_fentrylk - Winds fentrylk to the child stored in the fd context.
+ */
+int
+unify_fentrylk (call_frame_t *frame, xlator_t *this,
+                fd_t *fd, const char *basename,
+                entrylk_cmd cmd, entrylk_type type)
+
+{
+        xlator_t *subvol    = NULL;
+        uint64_t  ctx_value = 0;
+
+        UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
+
+        /* the owning child travels in the fd context as an integer */
+        fd_ctx_get (fd, this, &ctx_value);
+        subvol = (xlator_t *)(long)ctx_value;
+
+        STACK_WIND (frame, unify_fentrylk_cbk,
+                    subvol, subvol->fops->fentrylk,
+                    fd, basename, cmd, type);
+
+        return 0;
+}
+
+
+
+/**
+ * unify_fxattrop_cbk - Passes the child's fxattrop result and dict to the
+ * layer above, unchanged.
+ */
+int
+unify_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *xattr)
+{
+        STACK_UNWIND (frame,
+                      op_ret,
+                      op_errno,
+                      xattr);
+        return 0;
+}
+
+/**
+ * unify_fxattrop - Winds fxattrop to the child stored in the fd context.
+ */
+int
+unify_fxattrop (call_frame_t *frame, xlator_t *this,
+                fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr)
+{
+        xlator_t *subvol    = NULL;
+        uint64_t  ctx_value = 0;
+
+        UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
+
+        /* the owning child travels in the fd context as an integer */
+        fd_ctx_get (fd, this, &ctx_value);
+        subvol = (xlator_t *)(long)ctx_value;
+
+        STACK_WIND (frame, unify_fxattrop_cbk,
+                    subvol, subvol->fops->fxattrop,
+                    fd, optype, xattr);
+
+        return 0;
+}
+
+
+/**
+ * unify_inodelk_cbk - Passes the child's inodelk result to the layer
+ * above, unchanged.
+ */
+int
+unify_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno)
+{
+        STACK_UNWIND (frame,
+                      op_ret,
+                      op_errno);
+        return 0;
+}
+
+
+/**
+ * unify_inodelk - Winds inodelk to the child that unify_loc_subvol()
+ * returns for @loc.
+ */
+int
+unify_inodelk (call_frame_t *frame, xlator_t *this,
+               loc_t *loc, int cmd, struct flock *flock)
+{
+        xlator_t *subvol = unify_loc_subvol (loc, this);
+
+        STACK_WIND (frame, unify_inodelk_cbk,
+                    subvol, subvol->fops->inodelk,
+                    loc, cmd, flock);
+
+        return 0;
+}
+
+
+
+/**
+ * unify_entrylk_cbk - Passes the child's entrylk result to the layer
+ * above, unchanged.
+ */
+int
+unify_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno)
+{
+        STACK_UNWIND (frame,
+                      op_ret,
+                      op_errno);
+        return 0;
+}
+
+/**
+ * unify_entrylk - Winds entrylk to the child that unify_loc_subvol()
+ * returns for @loc.
+ */
+int
+unify_entrylk (call_frame_t *frame, xlator_t *this,
+               loc_t *loc, const char *basename,
+               entrylk_cmd cmd, entrylk_type type)
+
+{
+        xlator_t *subvol = unify_loc_subvol (loc, this);
+
+        STACK_WIND (frame, unify_entrylk_cbk,
+                    subvol, subvol->fops->entrylk,
+                    loc, basename, cmd, type);
+
+        return 0;
+}
+
+
+
+/**
+ * unify_xattrop_cbk - Passes the child's xattrop result and dict to the
+ * layer above, unchanged.
+ */
+int
+unify_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xattr)
+{
+        STACK_UNWIND (frame,
+                      op_ret,
+                      op_errno,
+                      xattr);
+        return 0;
+}
+
+/**
+ * unify_xattrop - Winds xattrop to the child that unify_loc_subvol()
+ * returns for @loc.
+ */
+int
+unify_xattrop (call_frame_t *frame, xlator_t *this,
+               loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr)
+{
+        xlator_t *subvol = unify_loc_subvol (loc, this);
+
+        STACK_WIND (frame, unify_xattrop_cbk,
+                    subvol, subvol->fops->xattrop,
+                    loc, optype, xattr);
+
+        return 0;
+}
+
+
+/**
+ * notify - Event entry point of the translator.
+ *
+ * Events are ignored until this->private is set. Without a scheduler the
+ * process raises SIGTERM. Events coming from the namespace node are only
+ * forwarded to the scheduler, and only for CHILD_UP. For the storage
+ * children:
+ *  - CHILD_UP informs the scheduler, bumps inode_generation and
+ *    num_child_up under priv->lock and, if the volume was not yet marked
+ *    up, passes the event up and sets is_up;
+ *  - CHILD_DOWN informs the scheduler, decrements num_child_up under
+ *    priv->lock and, when that decrement reached zero, passes the event
+ *    up and clears is_up;
+ *  - anything else goes to default_notify().
+ */
+int32_t
+notify (xlator_t *this,
+        int32_t event,
+        void *data,
+        ...)
+{
+        unify_private_t *priv = this->private;
+        struct sched_ops *sched = NULL;
+        int all_down = 0;
+
+        if (!priv) {
+                return 0;
+        }
+
+        sched = priv->sched_ops;
+        if (!sched) {
+                gf_log (this->name, GF_LOG_CRITICAL, "No scheduler :O");
+                raise (SIGTERM);
+                return 0;
+        }
+        if (priv->namespace == data) {
+                if (event == GF_EVENT_CHILD_UP) {
+                        sched->notify (this, event, data);
+                }
+                return 0;
+        }
+
+        switch (event)
+        {
+        case GF_EVENT_CHILD_UP:
+        {
+                /* Call scheduler's update () to enable it for scheduling */
+                sched->notify (this, event, data);
+
+                LOCK (&priv->lock);
+                {
+                        /* Increment the inode's generation, which is
+                           used for self_heal */
+                        ++priv->inode_generation;
+                        ++priv->num_child_up;
+                }
+                UNLOCK (&priv->lock);
+
+                if (!priv->is_up) {
+                        default_notify (this, event, data);
+                        priv->is_up = 1;
+                }
+        }
+        break;
+        case GF_EVENT_CHILD_DOWN:
+        {
+                /* Call scheduler's update () to disable the child node
+                 * for scheduling
+                 */
+                sched->notify (this, event, data);
+                LOCK (&priv->lock);
+                {
+                        /* decide while the lock is held, so a concurrent
+                         * CHILD_UP/CHILD_DOWN cannot change the counter
+                         * between the decrement and the test */
+                        all_down = (--priv->num_child_up == 0);
+                }
+                UNLOCK (&priv->lock);
+
+                if (all_down) {
+                        /* Send CHILD_DOWN to upper layer */
+                        default_notify (this, event, data);
+                        priv->is_up = 0;
+                }
+        }
+        break;
+
+        default:
+        {
+                default_notify (this, event, data);
+        }
+        break;
+        }
+
+        return 0;
+}
+
+/**
+ * init - This function is called first in the xlator, while initializing.
+ * All the config file options are checked and appropriate flags are set.
+ *
+ * @this -
+ */
+int32_t
+init (xlator_t *this)
+{
+ int32_t ret = 0;
+ int32_t count = 0;
+ data_t *scheduler = NULL;
+ data_t *data = NULL;
+ xlator_t *ns_xl = NULL;
+ xlator_list_t *trav = NULL;
+ xlator_list_t *xlparent = NULL;
+ xlator_list_t *parent = NULL;
+ unify_private_t *_private = NULL;
+
+ /* Check for number of child nodes, if there is no child nodes, exit */
+ if (!this->children) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "No child nodes specified. check \"subvolumes \" "
+ "option in volfile");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile ");
+ }
+
+ /* Check for 'scheduler' in volume */
+ scheduler = dict_get (this->options, "scheduler");
+ if (!scheduler) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "\"option scheduler <x>\" is missing in volfile");
+ return -1;
+ }
+
+ /* Setting "option namespace <node>" */
+ data = dict_get (this->options, "namespace");
+ if(!data) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "namespace option not specified, Exiting");
+ return -1;
+ }
+ /* Search namespace in the child node, if found, exit */
+ trav = this->children;
+ while (trav) {
+ if (strcmp (trav->xlator->name, data->data) == 0)
+ break;
+ trav = trav->next;
+ }
+ if (trav) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "namespace node used as a subvolume, Exiting");
+ return -1;
+ }
+
+ /* Search for the namespace node, if found, continue */
+ ns_xl = this->next;
+ while (ns_xl) {
+ if (strcmp (ns_xl->name, data->data) == 0)
+ break;
+ ns_xl = ns_xl->next;
+ }
+ if (!ns_xl) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "namespace node not found in volfile, Exiting");
+ return -1;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "namespace node specified as %s", data->data);
+
+ _private = CALLOC (1, sizeof (*_private));
+ ERR_ABORT (_private);
+ _private->sched_ops = get_scheduler (this, scheduler->data);
+ if (!_private->sched_ops) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Error while loading scheduler. Exiting");
+ FREE (_private);
+ return -1;
+ }
+
+ if (ns_xl->parents) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Namespace node should not be a child of any other node. Exiting");
+ FREE (_private);
+ return -1;
+ }
+
+ _private->namespace = ns_xl;
+
+ /* update _private structure */
+ {
+ count = 0;
+ trav = this->children;
+ /* Get the number of child count */
+ while (trav) {
+ count++;
+ trav = trav->next;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Child node count is %d", count);
+
+ _private->child_count = count;
+ if (count == 1) {
+ /* TODO: Should I error out here? */
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "WARNING: You have defined only one "
+ "\"subvolumes\" for unify volume. It may not "
+ "be the desired config, review your volume "
+ "volfile. If this is how you are testing it,"
+ " you may hit some performance penalty");
+ }
+
+ _private->xl_array = CALLOC (1,
+ sizeof (xlator_t) * (count + 1));
+ ERR_ABORT (_private->xl_array);
+
+ count = 0;
+ trav = this->children;
+ while (trav) {
+ _private->xl_array[count++] = trav->xlator;
+ trav = trav->next;
+ }
+ _private->xl_array[count] = _private->namespace;
+
+ /* self-heal part, start with generation '1' */
+ _private->inode_generation = 1;
+ /* Because, Foreground part is tested well */
+ _private->self_heal = ZR_UNIFY_FG_SELF_HEAL;
+ data = dict_get (this->options, "self-heal");
+ if (data) {
+ if (strcasecmp (data->data, "off") == 0)
+ _private->self_heal = ZR_UNIFY_SELF_HEAL_OFF;
+
+ if (strcasecmp (data->data, "foreground") == 0)
+ _private->self_heal = ZR_UNIFY_FG_SELF_HEAL;
+
+ if (strcasecmp (data->data, "background") == 0)
+ _private->self_heal = ZR_UNIFY_BG_SELF_HEAL;
+ }
+
+ /* optimist - ask bulde for more about it */
+ data = dict_get (this->options, "optimist");
+ if (data) {
+ if (gf_string2boolean (data->data,
+ &_private->optimist) == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "optimist excepts only boolean "
+ "options");
+ }
+ }
+
+ LOCK_INIT (&_private->lock);
+ }
+
+ /* Now that everything is fine. */
+ this->private = (void *)_private;
+ {
+ /* Initialize scheduler, if everything else is successful */
+ ret = _private->sched_ops->init (this);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Initializing scheduler failed, Exiting");
+ FREE (_private);
+ return -1;
+ }
+
+ ret = 0;
+
+ /* This section is required because some fops may look
+ * for 'xl->parent' variable
+ */
+ xlparent = CALLOC (1, sizeof (*xlparent));
+ xlparent->xlator = this;
+ if (!ns_xl->parents) {
+ ns_xl->parents = xlparent;
+ } else {
+ parent = ns_xl->parents;
+ while (parent->next)
+ parent = parent->next;
+ parent->next = xlparent;
+ }
+ /* Initialize the namespace volume */
+ if (!ns_xl->ready) {
+ ret = xlator_tree_init (ns_xl);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "initializing namespace node failed, "
+ "Exiting");
+ FREE (_private);
+ return -1;
+ }
+ }
+ }
+
+ /* Tell namespace node that init is done */
+ ns_xl->notify (ns_xl, GF_EVENT_PARENT_UP, this);
+
+ return 0;
+}
+
+/**
+ * fini - Release everything the unify translator attached to @this.
+ *
+ * @this: the unify translator instance being torn down.
+ *
+ * The scheduler is shut down first through its fini hook, then the
+ * private structure is detached from the translator before its lock is
+ * destroyed and the child array and the structure itself are freed.
+ *
+ * Nothing is done when no private structure is attached, so the function
+ * is safe to call on a translator whose initialization returned before
+ * assigning this->private (assuming the pointer is still NULL then).
+ */
+void
+fini (xlator_t *this)
+{
+        unify_private_t *priv = this->private;
+
+        /* No private data attached: there is nothing to tear down. */
+        if (!priv)
+                return;
+
+        priv->sched_ops->fini (this);
+
+        /* Detach before freeing so nobody picks up a dangling pointer. */
+        this->private = NULL;
+
+        LOCK_DESTROY (&priv->lock);
+        FREE (priv->xl_array);
+        FREE (priv);
+        return;
+}
+
+
+/*
+ * File-operation table of the unify translator: every fop listed here is
+ * routed to its unify_* handler.  Entries are grouped by purpose; each
+ * member is set exactly once through a designated initializer, so the
+ * listing order has no effect on the resulting table.
+ */
+struct xlator_fops fops = {
+        /* lookup and read-only queries */
+        .lookup      = unify_lookup,
+        .stat        = unify_stat,
+        .fstat       = unify_fstat,
+        .access      = unify_access,
+        .readlink    = unify_readlink,
+        .statfs      = unify_statfs,
+        .checksum    = unify_checksum,
+
+        /* attribute updates */
+        .chmod       = unify_chmod,
+        .fchmod      = unify_fchmod,
+        .chown       = unify_chown,
+        .fchown      = unify_fchown,
+        .truncate    = unify_truncate,
+        .ftruncate   = unify_ftruncate,
+        .utimens     = unify_utimens,
+
+        /* entry creation, renaming and removal */
+        .mknod       = unify_mknod,
+        .mkdir       = unify_mkdir,
+        .create      = unify_create,
+        .symlink     = unify_symlink,
+        .link        = unify_link,
+        .rename      = unify_rename,
+        .unlink      = unify_unlink,
+        .rmdir       = unify_rmdir,
+
+        /* file I/O */
+        .open        = unify_open,
+        .readv       = unify_readv,
+        .writev      = unify_writev,
+        .flush       = unify_flush,
+        .fsync       = unify_fsync,
+
+        /* directory I/O */
+        .opendir     = unify_opendir,
+        .readdir     = unify_readdir,
+        .getdents    = unify_getdents,
+        .fsyncdir    = unify_fsyncdir,
+
+        /* extended attributes */
+        .setxattr    = unify_setxattr,
+        .getxattr    = unify_getxattr,
+        .removexattr = unify_removexattr,
+        .xattrop     = unify_xattrop,
+        .fxattrop    = unify_fxattrop,
+
+        /* locking */
+        .lk          = unify_lk,
+        .inodelk     = unify_inodelk,
+        .finodelk    = unify_finodelk,
+        .entrylk     = unify_entrylk,
+        .fentrylk    = unify_fentrylk,
+};
+
+/* Management-operation table: the initializer is empty, so this
+ * translator sets no member of it explicitly. */
+struct xlator_mops mops = {
+};
+
+/* Callback table: the initializer is empty, so this translator sets no
+ * member of it explicitly. */
+struct xlator_cbks cbks = {
+};
+
+/*
+ * Volume options recognised by the unify translator.  The table is
+ * terminated by an entry whose first key is NULL.
+ */
+struct volume_options options[] = {
+        /* name of the namespace xlator */
+        {
+                .key   = { "namespace" },
+                .type  = GF_OPTION_TYPE_XLATOR
+        },
+
+        /* scheduler to load, restricted to the listed names */
+        {
+                .key   = { "scheduler" },
+                .type  = GF_OPTION_TYPE_STR,
+                .value = { "alu", "rr", "random", "nufa", "switch" }
+        },
+
+        /* self-heal mode */
+        {
+                .key   = { "self-heal" },
+                .type  = GF_OPTION_TYPE_STR,
+                .value = { "foreground", "background", "off" }
+        },
+
+        /* TODO: remove it some time later */
+        {
+                .key   = { "optimist" },
+                .type  = GF_OPTION_TYPE_BOOL
+        },
+
+        /* end-of-table marker */
+        {
+                .key   = { NULL }
+        },
+};
diff --git a/xlators/cluster/unify/src/unify.h b/xlators/cluster/unify/src/unify.h
new file mode 100644
index 000000000..bc18dc53f
--- /dev/null
+++ b/xlators/cluster/unify/src/unify.h
@@ -0,0 +1,132 @@
+/*
+ Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef _UNIFY_H
+#define _UNIFY_H
+
+#include "scheduler.h"
+#include "list.h"
+
+/* 32 KiB.  NOTE(review): presumably an upper bound for a directory-entry
+ * string buffer; the users are outside this header -- confirm there. */
+#define MAX_DIR_ENTRY_STRING (32 * 1024)
+
+/* Values kept in unify_private.self_heal; they correspond to the
+ * "self-heal" option values "off", "foreground" and "background". */
+#define ZR_UNIFY_SELF_HEAL_OFF 0
+#define ZR_UNIFY_FG_SELF_HEAL 1
+#define ZR_UNIFY_BG_SELF_HEAL 2
+
+/* Arbitrarily chosen value (the original author describes it as a
+ * "completely random number").  NOTE(review): presumably the number of
+ * entries requested per getdents call during self-heal -- confirm. */
+#define UNIFY_SELF_HEAL_GETDENTS_COUNT 1024
+
+/* NS(xl) - the namespace xlator recorded in the unify private data of
+ * translator 'xl'.  The parameter is parenthesized so that any pointer
+ * expression (not just a plain identifier) can be passed safely. */
+#define NS(xl) (((unify_private_t *)(xl)->private)->namespace)
+
+/*
+ * INIT_LOCAL - allocate a zeroed unify_local_t, store it in 'loc' and
+ * attach it to the call frame 'fr'.
+ *
+ * fr:  call frame whose ->local member receives the new structure.
+ * loc: lvalue of type unify_local_t * that receives the allocation.
+ *
+ * The new local starts out with op_ret = -1 and op_errno = ENOENT.  When
+ * the allocation fails, the frame is unwound with (-1, ENOMEM) and the
+ * macro executes 'return 0' in the CALLING function, so it can only be
+ * used inside functions that return an integer.
+ *
+ * NOTE(review): ERR_ABORT is defined elsewhere; if it aborts on a NULL
+ * pointer, the ENOMEM branch below can never run -- confirm which of the
+ * two behaviours is intended.
+ *
+ * Both parameters are parenthesized at every use so that expressions,
+ * not only plain identifiers, expand correctly.
+ */
+#define INIT_LOCAL(fr, loc) \
+do { \
+        (loc) = CALLOC (1, sizeof (unify_local_t)); \
+        ERR_ABORT (loc); \
+        if (!(loc)) { \
+                STACK_UNWIND ((fr), -1, ENOMEM); \
+                return 0; \
+        } \
+        (fr)->local = (loc); \
+        (loc)->op_ret = -1; \
+        (loc)->op_errno = ENOENT; \
+} while (0)
+
+
+
+/*
+ * Private state of one unify translator instance.  It is allocated and
+ * filled in while the translator is initialized and released by fini().
+ */
+struct unify_private {
+ /* Update this structure depending on requirement */
+ void *scheduler; /* THIS SHOULD BE THE FIRST VARIABLE,
+ if xlator is using scheduler */
+ struct sched_ops *sched_ops; /* scheduler operations table obtained
+ from get_scheduler(); its init and
+ fini hooks are called at start-up
+ and teardown */
+ xlator_t *namespace; /* ptr to namespace xlator */
+ xlator_t **xl_array; /* child xlators in this->children order,
+ followed by the namespace xlator at
+ index child_count */
+ gf_boolean_t optimist; /* parsed from the "optimist" option */
+ int16_t child_count; /* number of entries in this->children */
+ int16_t num_child_up; /* NOTE(review): presumably the number of
+ children currently up -- confirm */
+ uint8_t self_heal; /* one of the ZR_UNIFY_*_SELF_HEAL values;
+ foreground unless the "self-heal"
+ option says otherwise */
+ uint8_t is_up; /* NOTE(review): presumably an "up" flag for
+ the volume -- confirm against its users */
+ uint64_t inode_generation; /* starts at 1 at initialization */
+ gf_lock_t lock; /* set up with LOCK_INIT, destroyed in fini();
+ what it guards is not visible here */
+};
+typedef struct unify_private unify_private_t;
+
+/*
+ * Working data for self-heal; unify_local_t points to one of these
+ * through its sh_struct member.
+ * NOTE(review): the per-field notes below are inferred from the field
+ * names only -- the code that fills them is outside this header.
+ */
+struct unify_self_heal_struct {
+ /* Four checksum buffers of ZR_FILENAME_MAX bytes each; the ns_*
+ variants presumably hold the namespace node's checksums. */
+ uint8_t dir_checksum[ZR_FILENAME_MAX];
+ uint8_t ns_dir_checksum[ZR_FILENAME_MAX];
+ uint8_t file_checksum[ZR_FILENAME_MAX];
+ uint8_t ns_file_checksum[ZR_FILENAME_MAX];
+ /* Presumably parallel, dynamically allocated lists (offset, count
+ and entries) -- confirm indexing and ownership at the users. */
+ off_t *offset_list;
+ int *count_list;
+ dir_entry_t **entry_list;
+};
+
+
+/*
+ * Per-call state kept in a call frame's ->local member.  INIT_LOCAL
+ * allocates it zero-filled, then sets op_ret to -1 and op_errno to
+ * ENOENT.  Fields without a comment are not described by anything in
+ * this header; see the fops that use them.
+ */
+struct _unify_local_t {
+ int32_t call_count;
+ int32_t op_ret; /* -1 right after INIT_LOCAL */
+ int32_t op_errno; /* ENOENT right after INIT_LOCAL */
+ mode_t mode;
+ off_t offset;
+ dev_t dev;
+ uid_t uid;
+ gid_t gid;
+ int32_t flags;
+ int32_t entry_count;
+ int32_t count; // dir_entry_t count;
+ fd_t *fd;
+ struct stat stbuf;
+ struct statvfs statvfs_buf;
+ struct timespec tv[2];
+ char *name;
+ int32_t revalidate;
+
+ ino_t st_ino;
+ nlink_t st_nlink;
+
+ dict_t *dict;
+
+ int16_t *list;
+ int16_t *new_list; /* Used only in case of rename */
+ int16_t index;
+
+ int32_t failed;
+ int32_t return_eio; /* Used in case of different st-mode
+ present for a given path */
+
+ uint64_t inode_generation; /* used to store the per directory
+ * inode_generation. Got from inode's ctx
+ * of directory inodes
+ */
+
+ /* self-heal working data (see struct unify_self_heal_struct) */
+ struct unify_self_heal_struct *sh_struct;
+ loc_t loc1, loc2;
+};
+typedef struct _unify_local_t unify_local_t;
+
+/*
+ * Self-heal entry point; the definition lives outside this header.
+ * NOTE(review): return-value meaning and ownership of 'local' are not
+ * visible here -- confirm against the definition.
+ */
+int32_t
+zr_unify_self_heal (call_frame_t *frame, xlator_t *this,
+                    unify_local_t *local);
+
+#endif /* _UNIFY_H */