summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/afr
diff options
context:
space:
mode:
author Vikas Gorur <vikas@zresearch.com> 2009-02-18 17:36:07 +0530
committer Vikas Gorur <vikas@zresearch.com> 2009-02-18 17:36:07 +0530
commit 77adf4cd648dce41f89469dd185deec6b6b53a0b (patch)
tree 02e155a5753b398ee572b45793f889b538efab6b /xlators/cluster/afr
parent f3b2e6580e5663292ee113c741343c8a43ee133f (diff)
Added all files
Diffstat (limited to 'xlators/cluster/afr')
-rw-r--r-- xlators/cluster/afr/Makefile.am 3
-rw-r--r-- xlators/cluster/afr/src/Makefile.am 20
-rw-r--r-- xlators/cluster/afr/src/afr-dir-read.c 345
-rw-r--r-- xlators/cluster/afr/src/afr-dir-read.h 47
-rw-r--r-- xlators/cluster/afr/src/afr-dir-write.c 1786
-rw-r--r-- xlators/cluster/afr/src/afr-dir-write.h 59
-rw-r--r-- xlators/cluster/afr/src/afr-inode-read.c 721
-rw-r--r-- xlators/cluster/afr/src/afr-inode-read.h 47
-rw-r--r-- xlators/cluster/afr/src/afr-inode-write.c 2024
-rw-r--r-- xlators/cluster/afr/src/afr-inode-write.h 63
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-common.c 1073
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-common.h 66
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-data.c 1030
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-entry.c 2038
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-metadata.c 791
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal.h 52
-rw-r--r-- xlators/cluster/afr/src/afr-transaction.c 957
-rw-r--r-- xlators/cluster/afr/src/afr-transaction.h 36
-rw-r--r-- xlators/cluster/afr/src/afr.c 2338
-rw-r--r-- xlators/cluster/afr/src/afr.h 523
20 files changed, 14019 insertions, 0 deletions
diff --git a/xlators/cluster/afr/Makefile.am b/xlators/cluster/afr/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/cluster/afr/Makefile.am
@@ -0,0 +1,3 @@
+# Recurse into src/, whose Makefile.am builds the afr translator module.
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am
new file mode 100644
index 000000000..1bde9e5ba
--- /dev/null
+++ b/xlators/cluster/afr/src/Makefile.am
@@ -0,0 +1,20 @@
+# Build afr.la as a loadable module installed under the cluster xlator dir.
+xlator_LTLIBRARIES = afr.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+# libtool spells this flag -avoid-version; "-avoidversion" is not a
+# libtool option.
+afr_la_LDFLAGS = -module -avoid-version
+
+afr_la_SOURCES = afr.c afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c
+afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
+	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES =
+
+# replicate.so is installed as a symlink to afr.so; remove it on uninstall.
+uninstall-local:
+	rm -f $(DESTDIR)$(xlatordir)/replicate.so
+
+install-data-hook:
+	ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so
\ No newline at end of file
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
new file mode 100644
index 000000000..0c65ca852
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -0,0 +1,345 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+
+#include "afr.h"
+
+
+/**
+ * afr_opendir_cbk - collect one child's reply to an opendir.
+ *
+ * Marks the operation successful as soon as any child returns 0 and
+ * stores the op_errno of whichever reply arrived most recently.  When
+ * afr_frame_return () returns 0 the caller is unwound with the fd that
+ * was saved in the local.
+ */
+int32_t
+afr_opendir_cbk (call_frame_t *frame, void *cookie,
+                 xlator_t *this, int32_t op_ret, int32_t op_errno,
+                 fd_t *fd)
+{
+        afr_local_t *local     = frame->local;
+        int          remaining = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == 0)
+                        local->op_ret = 0;
+
+                /* the latest reply always overwrites the stored errno */
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret,
+                          local->op_errno, local->fd);
+
+        return 0;
+}
+
+
+/**
+ * afr_opendir - send opendir to every child flagged up in the local.
+ *
+ * @frame: call frame of the request
+ * @this:  this translator; this->private holds the afr_private_t
+ * @loc:   directory to open
+ * @fd:    fd to open it on; a reference is kept in local->fd
+ *
+ * Replies are gathered by afr_opendir_cbk.  If setup fails the frame
+ * is unwound here with op_ret -1 and the original fd.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_opendir (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, fd_t *fd)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int     idx       = 0;
+        int     nchildren = 0;
+        int     to_wind   = -1;
+        int     ret       = -1;
+        int32_t op_ret    = -1;
+        int32_t op_errno  = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv      = this->private;
+        nchildren = priv->child_count;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        local->fd    = fd_ref (fd);
+        frame->local = local;
+
+        /* the loop is bounded by this copy, not by local->call_count */
+        to_wind = local->call_count;
+
+        for (idx = 0; idx < nchildren; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND (frame, afr_opendir_cbk,
+                            priv->children[idx],
+                            priv->children[idx]->fops->opendir,
+                            loc, fd);
+
+                if (--to_wind == 0)
+                        break;
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, fd);
+        }
+
+        return 0;
+}
+
+
+/**
+ * Common algorithm for directory read calls (readdir, getdents):
+ *
+ * - Send the fop to the first child that is up.
+ * - If a reply comes back with op_ret == -1 and all_tried () says
+ *   there are children left, advance last_tried by one and send the
+ *   same request to that child.
+ * - Otherwise hand the reply to the caller unchanged.
+ *
+ * Note that the retry is taken on any failure; op_errno is not
+ * inspected.
+ */
+
+int32_t
+afr_readdir_cbk (call_frame_t *frame, void *cookie,
+                 xlator_t *this, int32_t op_ret, int32_t op_errno,
+                 gf_dirent_t *buf)
+{
+        afr_private_t *priv     = this->private;
+        xlator_t     **children = priv->children;
+        afr_local_t   *local    = frame->local;
+        int            next     = -1;
+
+        if (op_ret != -1)
+                goto unwind;
+
+        if (all_tried (local->cont.readdir.last_tried, priv->child_count))
+                goto unwind;
+
+        next = ++local->cont.readdir.last_tried;
+
+        STACK_WIND (frame, afr_readdir_cbk,
+                    children[next],
+                    children[next]->fops->readdir,
+                    local->fd, local->cont.readdir.size,
+                    local->cont.readdir.offset);
+
+        return 0;
+
+unwind:
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
+
+        return 0;
+}
+
+
+/**
+ * afr_readdir - read directory entries from the first child that is up.
+ *
+ * @frame:  call frame of the request
+ * @this:   this translator
+ * @fd:     open directory fd; a reference is kept in local->fd
+ * @size:   passed through to the child's readdir
+ * @offset: passed through to the child's readdir
+ *
+ * The arguments are saved in the local so afr_readdir_cbk can repeat
+ * the request on another child.  Unwinds with ENOTCONN when
+ * afr_first_up_child () finds no child.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_readdir (call_frame_t *frame, xlator_t *this,
+             fd_t *fd, size_t size, off_t offset)
+{
+        afr_private_t *priv     = NULL;
+        afr_local_t   *local    = NULL;
+        xlator_t     **children = NULL;
+
+        int     first_up = 0;
+        int     ret      = -1;
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv     = this->private;
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        frame->local = local;
+
+        first_up = afr_first_up_child (priv);
+        if (first_up == -1) {
+                op_errno = ENOTCONN;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no child is up :(");
+                goto out;
+        }
+
+        /* remember the request so the callback can replay it */
+        local->fd                      = fd_ref (fd);
+        local->cont.readdir.size       = size;
+        local->cont.readdir.offset     = offset;
+        local->cont.readdir.last_tried = first_up;
+
+        STACK_WIND (frame, afr_readdir_cbk,
+                    children[first_up], children[first_up]->fops->readdir,
+                    fd, size, offset);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_getdents_cbk - handle a child's getdents reply.
+ *
+ * On op_ret == -1, and while all_tried () reports children remaining,
+ * bumps cont.getdents.last_tried and re-issues the saved request to
+ * that child.  Any other reply (or the last failure) is passed to the
+ * caller as received.
+ */
+int32_t
+afr_getdents_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno,
+                  dir_entry_t *entry, int32_t count)
+{
+        afr_private_t *priv     = this->private;
+        xlator_t     **children = priv->children;
+        afr_local_t   *local    = frame->local;
+        int            next     = -1;
+
+        if (op_ret != -1)
+                goto unwind;
+
+        if (all_tried (local->cont.getdents.last_tried, priv->child_count))
+                goto unwind;
+
+        next = ++local->cont.getdents.last_tried;
+
+        STACK_WIND (frame, afr_getdents_cbk,
+                    children[next],
+                    children[next]->fops->getdents,
+                    local->fd, local->cont.getdents.size,
+                    local->cont.getdents.offset, local->cont.getdents.flag);
+
+        return 0;
+
+unwind:
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, entry, count);
+
+        return 0;
+}
+
+
+/**
+ * afr_getdents - fetch directory entries from the first child that is up.
+ *
+ * @frame:  call frame of the request
+ * @this:   this translator
+ * @fd:     open directory fd; a reference is kept in local->fd
+ * @size:   passed through to the child's getdents
+ * @offset: passed through to the child's getdents
+ * @flag:   passed through to the child's getdents
+ *
+ * The arguments are saved in the local so afr_getdents_cbk can repeat
+ * the request on another child.  Unwinds with ENOTCONN when
+ * afr_first_up_child () finds no child.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_getdents (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, size_t size, off_t offset, int32_t flag)
+{
+        afr_private_t *priv       = NULL;
+        xlator_t     **children   = NULL;
+        int            call_child = 0;
+        afr_local_t   *local      = NULL;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv     = this->private;
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        /*
+         * Attach the local before the first failure exit, as
+         * afr_readdir does, so both fops treat it the same way on the
+         * error path.
+         */
+        frame->local = local;
+
+        call_child = afr_first_up_child (priv);
+        if (call_child == -1) {
+                op_errno = ENOTCONN;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no child is up :(");
+                goto out;
+        }
+
+        local->cont.getdents.last_tried = call_child;
+
+        local->fd = fd_ref (fd);
+
+        local->cont.getdents.size   = size;
+        local->cont.getdents.offset = offset;
+        local->cont.getdents.flag   = flag;
+
+        STACK_WIND (frame, afr_getdents_cbk,
+                    children[call_child], children[call_child]->fops->getdents,
+                    fd, size, offset, flag);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                /* a getdents reply carries both an entry list and a count */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, 0);
+        }
+
+        return 0;
+}
+
+
diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h
new file mode 100644
index 000000000..172ec3c90
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-dir-read.h
@@ -0,0 +1,47 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __DIR_READ_H__
+#define __DIR_READ_H__
+
+/* Prototypes for the directory-read fops of the afr translator. */
+
+/* Open directory `loc` on `fd`. */
+int32_t
+afr_opendir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, fd_t *fd);
+
+/* NOTE(review): no definition in afr-dir-read.c as added here -- confirm
+   where (or whether) this is implemented. */
+int32_t
+afr_closedir (call_frame_t *frame, xlator_t *this,
+ fd_t *fd);
+
+/* Read directory entries from `fd`; `size` and `offset` are forwarded to
+   the child's readdir. */
+int32_t
+afr_readdir (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset);
+
+
+/* Like afr_readdir but through the getdents fop, with an extra `flag`
+   forwarded to the child. */
+int32_t
+afr_getdents (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset, int32_t flag);
+
+
+/* NOTE(review): no definition in afr-dir-read.c as added here -- confirm
+   where (or whether) this is implemented. */
+int32_t
+afr_checksum (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags);
+
+
+#endif /* __DIR_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
new file mode 100644
index 000000000..87a6e09b5
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -0,0 +1,1786 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <errno.h>
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+
+#include "afr.h"
+#include "afr-transaction.h"
+
+
+/**
+ * afr_build_parent_loc - fill @parent with the loc of @child's directory.
+ *
+ * @parent: loc to fill in
+ * @child:  loc whose parent directory is wanted
+ *
+ * If @child has no parent inode, @parent becomes a copy of @child.
+ * Otherwise the path is the dirname of @child's path, the name is the
+ * component after its last '/', and the inode is a new reference to
+ * child->parent.
+ */
+void
+afr_build_parent_loc (loc_t *parent, loc_t *child)
+{
+        char *tmp = NULL;
+
+        if (!child->parent) {
+                loc_copy (parent, child);
+                return;
+        }
+
+        /* dirname () may modify its argument, so work on a scratch copy */
+        tmp = strdup (child->path);
+        parent->path = strdup (dirname (tmp));
+        FREE (tmp);
+
+        /* strdup can fail; do not hand a NULL path to strrchr */
+        if (parent->path) {
+                parent->name = strrchr (parent->path, '/');
+                if (parent->name)
+                        parent->name++;
+        }
+
+        parent->inode  = inode_ref (child->parent);
+        parent->parent = inode_parent (parent->inode, 0, NULL);
+        parent->ino    = parent->inode->ino;
+}
+
+
+/* {{{ create */
+
+/**
+ * afr_create_unwind - reply to the original create caller, at most once.
+ *
+ * transaction.main_frame is taken and cleared under the frame lock, so
+ * a second call finds NULL and does nothing.
+ */
+int
+afr_create_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local  = frame->local;
+        call_frame_t *caller = NULL;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!caller)
+                return 0;
+
+        AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno,
+                          local->cont.create.fd,
+                          local->cont.create.inode,
+                          &local->cont.create.buf);
+        return 0;
+}
+
+
+/**
+ * afr_create_wind_cbk - collect one child's reply to create.
+ *
+ * @cookie carries the index of the replying child.  On success the
+ * stat is kept when it is the first success or comes from
+ * priv->read_child, with st_ino passed through afr_itransform ().
+ * When afr_frame_return () returns 0 the caller is answered and the
+ * transaction is resumed.
+ */
+int
+afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno,
+                     fd_t *fd, inode_t *inode, struct stat *buf)
+{
+        afr_local_t   *local       = frame->local;
+        afr_private_t *priv        = this->private;
+        struct stat   *saved       = NULL;
+        int            child_index = (long) cookie;
+        int            remaining   = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        local->op_ret = op_ret;
+
+                        if ((child_index == priv->read_child)
+                            || (local->success_count == 0)) {
+                                saved  = &local->cont.create.buf;
+                                *saved = *buf;
+                                saved->st_ino =
+                                        afr_itransform (buf->st_ino,
+                                                        priv->child_count,
+                                                        child_index);
+                        }
+
+                        local->cont.create.inode = inode;
+                        local->success_count++;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        local->transaction.unwind (frame, this);
+        local->transaction.resume (frame, this);
+
+        return 0;
+}
+
+
+/**
+ * afr_create_wind - send the saved create request to every up child.
+ *
+ * With no child up the transaction is resumed immediately.  Each wind
+ * passes the child's index as the cookie for afr_create_wind_cbk.
+ */
+int
+afr_create_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+        int            to_wind = -1;
+        int            idx     = 0;
+
+        to_wind = afr_up_children_count (priv->child_count, local->child_up);
+        if (to_wind == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = to_wind;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_create_wind_cbk,
+                                   (void *) (long) idx,
+                                   priv->children[idx],
+                                   priv->children[idx]->fops->create,
+                                   &local->loc,
+                                   local->cont.create.flags,
+                                   local->cont.create.mode,
+                                   local->cont.create.fd);
+
+                if (--to_wind == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_create_done - final step of the create transaction.
+ *
+ * Calls transaction.unwind (a no-op if the caller was already
+ * answered) and then destroys the transaction frame.
+ */
+int
+afr_create_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/**
+ * afr_create - create a file on all up children as an entry transaction.
+ *
+ * @frame: caller's frame; kept as transaction.main_frame
+ * @this:  this translator
+ * @loc:   location of the file to create
+ * @flags: passed through to the children's create
+ * @mode:  passed through to the children's create
+ * @fd:    fd to open; a reference is kept in the local
+ *
+ * The work runs on a copy of @frame handed to afr_transaction ().  If
+ * setup fails, the copy is destroyed and @frame is unwound with
+ * op_ret -1.
+ *
+ * Always returns 0.
+ */
+int
+afr_create (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, int32_t flags, mode_t mode, fd_t *fd)
+{
+        afr_private_t * priv  = NULL;
+        afr_local_t   * local = NULL;
+        call_frame_t  * transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* do not report a failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        loc_copy (&local->loc, loc);
+
+        local->cont.create.flags = flags;
+        local->cont.create.mode  = mode;
+        local->cont.create.fd    = fd_ref (fd);
+
+        local->transaction.fop    = afr_create_wind;
+        local->transaction.done   = afr_create_done;
+        local->transaction.unwind = afr_create_unwind;
+
+        afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.basename   = AFR_BASENAME (loc->path);
+        local->transaction.pending    = AFR_ENTRY_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                /* a create reply carries fd, inode and stat buf */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ mknod */
+
+/**
+ * afr_mknod_unwind - reply to the original mknod caller, at most once.
+ *
+ * transaction.main_frame is taken and cleared under the frame lock, so
+ * a second call finds NULL and does nothing.
+ */
+int
+afr_mknod_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local  = frame->local;
+        call_frame_t *caller = NULL;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!caller)
+                return 0;
+
+        AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno,
+                          local->cont.mknod.inode,
+                          &local->cont.mknod.buf);
+        return 0;
+}
+
+
+/**
+ * afr_mknod_wind_cbk - collect one child's reply to mknod.
+ *
+ * @cookie carries the index of the replying child.  On success the
+ * stat is kept when it is the first success or comes from
+ * priv->read_child, with st_ino passed through afr_itransform ().
+ * When afr_frame_return () returns 0 the caller is answered and the
+ * transaction is resumed.
+ */
+int
+afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno,
+                    inode_t *inode, struct stat *buf)
+{
+        afr_local_t   *local       = frame->local;
+        afr_private_t *priv        = this->private;
+        struct stat   *saved       = NULL;
+        int            child_index = (long) cookie;
+        int            remaining   = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        local->op_ret = op_ret;
+
+                        if ((child_index == priv->read_child)
+                            || (local->success_count == 0)) {
+                                saved  = &local->cont.mknod.buf;
+                                *saved = *buf;
+                                saved->st_ino =
+                                        afr_itransform (buf->st_ino,
+                                                        priv->child_count,
+                                                        child_index);
+                        }
+
+                        local->cont.mknod.inode = inode;
+                        local->success_count++;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        local->transaction.unwind (frame, this);
+        local->transaction.resume (frame, this);
+
+        return 0;
+}
+
+
+/**
+ * afr_mknod_wind - send the saved mknod request to every up child.
+ *
+ * With no child up the transaction is resumed immediately.  Each wind
+ * passes the child's index as the cookie for afr_mknod_wind_cbk.
+ */
+int32_t
+afr_mknod_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+        int            to_wind = -1;
+        int            idx     = 0;
+
+        to_wind = afr_up_children_count (priv->child_count, local->child_up);
+        if (to_wind == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = to_wind;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk,
+                                   (void *) (long) idx,
+                                   priv->children[idx],
+                                   priv->children[idx]->fops->mknod,
+                                   &local->loc, local->cont.mknod.mode,
+                                   local->cont.mknod.dev);
+
+                if (--to_wind == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_mknod_done - final step of the mknod transaction.
+ *
+ * Calls transaction.unwind (a no-op if the caller was already
+ * answered) and then destroys the transaction frame.
+ */
+int
+afr_mknod_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/**
+ * afr_mknod - create a node on all up children as an entry transaction.
+ *
+ * @frame: caller's frame; kept as transaction.main_frame
+ * @this:  this translator
+ * @loc:   location of the node to create
+ * @mode:  passed through to the children's mknod
+ * @dev:   passed through to the children's mknod
+ *
+ * The work runs on a copy of @frame handed to afr_transaction ().  If
+ * setup fails, the copy is destroyed and @frame is unwound with
+ * op_ret -1.
+ *
+ * Always returns 0.
+ */
+int
+afr_mknod (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, mode_t mode, dev_t dev)
+{
+        afr_private_t * priv  = NULL;
+        afr_local_t   * local = NULL;
+        call_frame_t  * transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* do not report a failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        loc_copy (&local->loc, loc);
+
+        local->cont.mknod.mode = mode;
+        local->cont.mknod.dev  = dev;
+
+        local->transaction.fop    = afr_mknod_wind;
+        local->transaction.done   = afr_mknod_done;
+        local->transaction.unwind = afr_mknod_unwind;
+
+        afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.basename   = AFR_BASENAME (loc->path);
+        local->transaction.pending    = AFR_ENTRY_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                /* a mknod reply carries inode and stat buf */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ mkdir */
+
+
+/**
+ * afr_mkdir_unwind - reply to the original mkdir caller, at most once.
+ *
+ * transaction.main_frame is taken and cleared under the frame lock, so
+ * a second call finds NULL and does nothing.
+ */
+int
+afr_mkdir_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local  = frame->local;
+        call_frame_t *caller = NULL;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!caller)
+                return 0;
+
+        AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno,
+                          local->cont.mkdir.inode,
+                          &local->cont.mkdir.buf);
+        return 0;
+}
+
+
+/**
+ * afr_mkdir_wind_cbk - collect one child's reply to mkdir.
+ *
+ * @cookie carries the index of the replying child.  On success the
+ * stat is kept when it is the first success or comes from
+ * priv->read_child, with st_ino passed through afr_itransform ().
+ * When afr_frame_return () returns 0 the caller is answered and the
+ * transaction is resumed.
+ */
+int
+afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno,
+                    inode_t *inode, struct stat *buf)
+{
+        afr_local_t   *local       = frame->local;
+        afr_private_t *priv        = this->private;
+        struct stat   *saved       = NULL;
+        int            child_index = (long) cookie;
+        int            remaining   = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        local->op_ret = op_ret;
+
+                        if ((child_index == priv->read_child)
+                            || (local->success_count == 0)) {
+                                saved  = &local->cont.mkdir.buf;
+                                *saved = *buf;
+                                saved->st_ino =
+                                        afr_itransform (buf->st_ino,
+                                                        priv->child_count,
+                                                        child_index);
+                        }
+
+                        local->cont.mkdir.inode = inode;
+                        local->success_count++;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        local->transaction.unwind (frame, this);
+        local->transaction.resume (frame, this);
+
+        return 0;
+}
+
+
+/**
+ * afr_mkdir_wind - send the saved mkdir request to every up child.
+ *
+ * With no child up the transaction is resumed immediately.  Each wind
+ * passes the child's index as the cookie for afr_mkdir_wind_cbk.
+ */
+int
+afr_mkdir_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+        int            to_wind = -1;
+        int            idx     = 0;
+
+        to_wind = afr_up_children_count (priv->child_count, local->child_up);
+        if (to_wind == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = to_wind;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk,
+                                   (void *) (long) idx,
+                                   priv->children[idx],
+                                   priv->children[idx]->fops->mkdir,
+                                   &local->loc, local->cont.mkdir.mode);
+
+                if (--to_wind == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_mkdir_done - final step of the mkdir transaction.
+ *
+ * Calls transaction.unwind (a no-op if the caller was already
+ * answered) and then destroys the transaction frame.
+ */
+int
+afr_mkdir_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/**
+ * afr_mkdir - create a directory on all up children as an entry
+ * transaction.
+ *
+ * @frame: caller's frame; kept as transaction.main_frame
+ * @this:  this translator
+ * @loc:   location of the directory to create
+ * @mode:  passed through to the children's mkdir
+ *
+ * The work runs on a copy of @frame handed to afr_transaction ().  If
+ * setup fails, the copy is destroyed and @frame is unwound with
+ * op_ret -1.
+ *
+ * Always returns 0.
+ */
+int
+afr_mkdir (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, mode_t mode)
+{
+        afr_private_t * priv  = NULL;
+        afr_local_t   * local = NULL;
+        call_frame_t  * transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* do not report a failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        loc_copy (&local->loc, loc);
+
+        local->cont.mkdir.mode = mode;
+
+        local->transaction.fop    = afr_mkdir_wind;
+        local->transaction.done   = afr_mkdir_done;
+        local->transaction.unwind = afr_mkdir_unwind;
+
+        afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.basename   = AFR_BASENAME (loc->path);
+        local->transaction.pending    = AFR_ENTRY_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+
+                /* a mkdir reply carries inode and stat buf */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ link */
+
+
+/**
+ * afr_link_unwind - reply to the original link caller, at most once.
+ *
+ * transaction.main_frame is taken and cleared under the frame lock, so
+ * a second call finds NULL and does nothing.  Before replying, st_ino
+ * in the saved stat is overwritten with cont.link.ino.
+ */
+int
+afr_link_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local  = frame->local;
+        call_frame_t *caller = NULL;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!caller)
+                return 0;
+
+        local->cont.link.buf.st_ino = local->cont.link.ino;
+
+        AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno,
+                          local->cont.link.inode,
+                          &local->cont.link.buf);
+
+        return 0;
+}
+
+
+/**
+ * afr_link_wind_cbk - collect one child's reply to link.
+ *
+ * @cookie carries the index of the replying child.  On success the
+ * stat is kept when it is the first success or comes from
+ * priv->read_child, with st_ino passed through afr_itransform ().
+ * When afr_frame_return () returns 0 the caller is answered and the
+ * transaction is resumed.
+ */
+int
+afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, inode_t *inode,
+                   struct stat *buf)
+{
+        afr_local_t   *local       = frame->local;
+        afr_private_t *priv        = this->private;
+        struct stat   *saved       = NULL;
+        int            child_index = (long) cookie;
+        int            remaining   = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        local->op_ret = op_ret;
+
+                        if ((child_index == priv->read_child)
+                            || (local->success_count == 0)) {
+                                saved  = &local->cont.link.buf;
+                                *saved = *buf;
+                                saved->st_ino =
+                                        afr_itransform (buf->st_ino,
+                                                        priv->child_count,
+                                                        child_index);
+                        }
+
+                        local->cont.link.inode = inode;
+                        local->success_count++;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        local->transaction.unwind (frame, this);
+        local->transaction.resume (frame, this);
+
+        return 0;
+}
+
+
+/**
+ * afr_link_wind - send the saved link request to every up child.
+ *
+ * With no child up the transaction is resumed immediately.  Each wind
+ * passes the child's index as the cookie for afr_link_wind_cbk.
+ */
+int
+afr_link_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+        int            to_wind = -1;
+        int            idx     = 0;
+
+        to_wind = afr_up_children_count (priv->child_count, local->child_up);
+        if (to_wind == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = to_wind;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_link_wind_cbk,
+                                   (void *) (long) idx,
+                                   priv->children[idx],
+                                   priv->children[idx]->fops->link,
+                                   &local->loc,
+                                   &local->newloc);
+
+                if (--to_wind == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_link_done - final step of the link transaction.
+ *
+ * Calls transaction.unwind (a no-op if the caller was already
+ * answered) and then destroys the transaction frame.
+ */
+int
+afr_link_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = NULL;
+
+        local = frame->local;
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/**
+ * afr_link - create a hard link on all up children as an entry
+ * transaction.
+ *
+ * @frame:  caller's frame; kept as transaction.main_frame
+ * @this:   this translator
+ * @oldloc: existing file
+ * @newloc: name of the link to create
+ *
+ * oldloc's inode number is remembered in cont.link.ino; afr_link_unwind
+ * puts it back into the stat returned to the caller.  The work runs on
+ * a copy of @frame handed to afr_transaction ().  If setup fails, the
+ * copy is destroyed and @frame is unwound with op_ret -1.
+ *
+ * Always returns 0.
+ */
+int
+afr_link (call_frame_t *frame, xlator_t *this,
+          loc_t *oldloc, loc_t *newloc)
+{
+        afr_private_t * priv  = NULL;
+        afr_local_t   * local = NULL;
+        call_frame_t  * transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* do not report a failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        loc_copy (&local->loc,    oldloc);
+        loc_copy (&local->newloc, newloc);
+
+        local->cont.link.ino = oldloc->inode->ino;
+
+        local->transaction.fop    = afr_link_wind;
+        local->transaction.done   = afr_link_done;
+        local->transaction.unwind = afr_link_unwind;
+
+        /* NOTE(review): the parent loc and basename come from oldloc,
+           although the new entry is created under newloc's directory --
+           confirm this is what the transaction code expects. */
+        afr_build_parent_loc (&local->transaction.parent_loc, oldloc);
+
+        local->transaction.main_frame   = frame;
+        local->transaction.basename     = AFR_BASENAME (oldloc->path);
+        local->transaction.new_basename = AFR_BASENAME (newloc->path);
+        local->transaction.pending      = AFR_ENTRY_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                /* a link reply carries inode and stat buf */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ symlink */
+
+
+/**
+ * afr_symlink_unwind - reply to the original symlink caller, at most
+ * once.
+ *
+ * transaction.main_frame is taken and cleared under the frame lock, so
+ * a second call finds NULL and does nothing.
+ */
+int
+afr_symlink_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local  = frame->local;
+        call_frame_t *caller = NULL;
+
+        LOCK (&frame->lock);
+        {
+                caller = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!caller)
+                return 0;
+
+        AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno,
+                          local->cont.symlink.inode,
+                          &local->cont.symlink.buf);
+        return 0;
+}
+
+
+/**
+ * afr_symlink_wind_cbk - collect one child's reply to symlink.
+ *
+ * @cookie carries the index of the replying child.  On success the
+ * stat is kept when it is the first success or comes from
+ * priv->read_child, with st_ino passed through afr_itransform ().
+ * When afr_frame_return () returns 0 the caller is answered and the
+ * transaction is resumed.
+ */
+int
+afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, inode_t *inode,
+                      struct stat *buf)
+{
+        afr_local_t   *local       = frame->local;
+        afr_private_t *priv        = this->private;
+        struct stat   *saved       = NULL;
+        int            child_index = (long) cookie;
+        int            remaining   = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        local->op_ret = op_ret;
+
+                        if ((child_index == priv->read_child)
+                            || (local->success_count == 0)) {
+                                saved  = &local->cont.symlink.buf;
+                                *saved = *buf;
+                                saved->st_ino =
+                                        afr_itransform (buf->st_ino,
+                                                        priv->child_count,
+                                                        child_index);
+                        }
+
+                        local->cont.symlink.inode = inode;
+                        local->success_count++;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        local->transaction.unwind (frame, this);
+        local->transaction.resume (frame, this);
+
+        return 0;
+}
+
+
+/**
+ * afr_symlink_wind - send the saved symlink request to every up child.
+ *
+ * With no child up the transaction is resumed immediately.  Each wind
+ * passes the child's index as the cookie for afr_symlink_wind_cbk.
+ */
+int
+afr_symlink_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+        int            to_wind = -1;
+        int            idx     = 0;
+
+        to_wind = afr_up_children_count (priv->child_count, local->child_up);
+        if (to_wind == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = to_wind;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk,
+                                   (void *) (long) idx,
+                                   priv->children[idx],
+                                   priv->children[idx]->fops->symlink,
+                                   local->cont.symlink.linkpath,
+                                   &local->loc);
+
+                if (--to_wind == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_symlink_done - final step of the symlink transaction.
+ *
+ * Calls transaction.unwind (a no-op if the caller was already
+ * answered) and then destroys the transaction frame.
+ */
+int
+afr_symlink_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = NULL;
+
+        local = frame->local;
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/**
+ * afr_symlink - entry point of the symlink fop.
+ *
+ * Copies @frame into a separate transaction frame, attaches a freshly
+ * initialised afr_local_t holding a copy of @loc and a duplicate of
+ * @linkpath, registers the wind/done/unwind hooks and starts an
+ * AFR_ENTRY_TRANSACTION on the copied frame. @frame is recorded as
+ * transaction.main_frame.
+ *
+ * On a setup failure the transaction frame (if it exists) is destroyed and
+ * @frame is unwound with op_ret -1. Failing to copy the frame or to
+ * duplicate @linkpath reports ENOMEM instead of leaving op_errno at 0.
+ *
+ * Always returns 0.
+ */
+int
+afr_symlink (call_frame_t *frame, xlator_t *this,
+             const char *linkpath, loc_t *loc)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* do not unwind a failure with op_errno == 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        loc_copy (&local->loc, loc);
+
+        local->cont.symlink.ino      = loc->inode->ino;
+        local->cont.symlink.linkpath = strdup (linkpath);
+        if (!local->cont.symlink.linkpath) {
+                /*
+                 * Winding with a NULL link path would hand garbage to the
+                 * children. local is already attached to the transaction
+                 * frame, which the error path below destroys.
+                 */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        local->transaction.fop    = afr_symlink_wind;
+        local->transaction.done   = afr_symlink_done;
+        local->transaction.unwind = afr_symlink_unwind;
+
+        afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.basename   = AFR_BASENAME (loc->path);
+        local->transaction.pending    = AFR_ENTRY_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ rename */
+
+/**
+ * afr_rename_unwind - reply to the caller of rename, at most once.
+ *
+ * transaction.main_frame is fetched and cleared under the frame lock, so
+ * only the first invocation finds a frame to unwind. The stat buffer sent
+ * upwards carries the inode number saved in cont.rename.ino.
+ *
+ * Always returns 0.
+ */
+int
+afr_rename_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local      = frame->local;
+        call_frame_t *main_frame = NULL;
+
+        LOCK (&frame->lock);
+        {
+                /* take ownership of the caller's frame and clear the slot */
+                main_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!main_frame)
+                return 0;
+
+        local->cont.rename.buf.st_ino = local->cont.rename.ino;
+
+        AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+                          &local->cont.rename.buf);
+
+        return 0;
+}
+
+
+/**
+ * afr_rename_wind_cbk - collect one child's reply to the rename fop.
+ *
+ * @cookie carries the index of the replying child. Only the first
+ * successful reply is recorded (op_ret and, when @buf is non-NULL, the stat
+ * buffer with a transformed inode number); op_errno is overwritten by every
+ * reply. When no calls are outstanding any more the transaction's unwind
+ * and resume hooks are invoked.
+ *
+ * Always returns 0.
+ */
+int
+afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+        int            child   = (long) cookie;
+        int            pending = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child);
+
+                if ((op_ret != -1) && (local->success_count == 0)) {
+                        local->op_ret = op_ret;
+
+                        if (buf) {
+                                local->cont.rename.buf = *buf;
+                                local->cont.rename.buf.st_ino =
+                                        afr_itransform (buf->st_ino,
+                                                        priv->child_count,
+                                                        child);
+                        }
+
+                        local->success_count++;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        local->transaction.unwind (frame, this);
+        local->transaction.resume (frame, this);
+
+        return 0;
+}
+
+
+/**
+ * afr_rename_wind - send the rename fop to every child marked up.
+ *
+ * local->call_count is set to the number of up children; with none up the
+ * transaction is resumed right away. The child index travels as cookie to
+ * afr_rename_wind_cbk.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_rename_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = -1;
+        int            child     = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_rename_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->rename,
+                                   &local->loc,
+                                   &local->newloc);
+
+                /* stop as soon as the planned number of winds is out */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_rename_done - "done" hook of the rename transaction.
+ *
+ * Invokes the transaction's unwind hook and then destroys the transaction
+ * frame with AFR_STACK_DESTROY. Always returns 0.
+ */
+int
+afr_rename_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = NULL;
+
+        local = frame->local;
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/**
+ * afr_rename - entry point of the rename fop.
+ *
+ * Copies @frame into a separate transaction frame, attaches a freshly
+ * initialised afr_local_t holding copies of @oldloc and @newloc, builds the
+ * parent locations and basenames of both, registers the wind/done/unwind
+ * hooks and starts an AFR_ENTRY_RENAME_TRANSACTION on the copied frame.
+ * @frame is recorded as transaction.main_frame.
+ *
+ * On a setup failure the transaction frame (if it exists) is destroyed and
+ * @frame is unwound with op_ret -1. A failed frame copy reports ENOMEM
+ * instead of leaving op_errno at 0.
+ *
+ * Always returns 0.
+ */
+int
+afr_rename (call_frame_t *frame, xlator_t *this,
+            loc_t *oldloc, loc_t *newloc)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* do not unwind a failure with op_errno == 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        loc_copy (&local->loc, oldloc);
+        loc_copy (&local->newloc, newloc);
+
+        /* inode number reported back to the caller by afr_rename_unwind */
+        local->cont.rename.ino = oldloc->inode->ino;
+
+        local->transaction.fop    = afr_rename_wind;
+        local->transaction.done   = afr_rename_done;
+        local->transaction.unwind = afr_rename_unwind;
+
+        afr_build_parent_loc (&local->transaction.parent_loc, oldloc);
+        afr_build_parent_loc (&local->transaction.new_parent_loc, newloc);
+
+        local->transaction.main_frame   = frame;
+        local->transaction.basename     = AFR_BASENAME (oldloc->path);
+        local->transaction.new_basename = AFR_BASENAME (newloc->path);
+        local->transaction.pending      = AFR_ENTRY_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ unlink */
+
+/**
+ * afr_unlink_unwind - reply to the caller of unlink, at most once.
+ *
+ * transaction.main_frame is fetched and cleared under the frame lock, so
+ * only the first invocation finds a frame to unwind with the collected
+ * op_ret / op_errno.
+ *
+ * Always returns 0.
+ */
+int
+afr_unlink_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local      = frame->local;
+        call_frame_t *main_frame = NULL;
+
+        LOCK (&frame->lock);
+        {
+                /* take ownership of the caller's frame and clear the slot */
+                main_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!main_frame)
+                return 0;
+
+        AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno);
+
+        return 0;
+}
+
+
+/**
+ * afr_unlink_wind_cbk - collect one child's reply to the unlink fop.
+ *
+ * @cookie carries the index of the replying child. The first successful
+ * reply sets op_ret, every successful reply bumps success_count and every
+ * reply overwrites op_errno. When no calls are outstanding any more the
+ * transaction's unwind and resume hooks are invoked.
+ *
+ * NOTE(review): an earlier revision also compared success_count with
+ * priv->wait_count into a flag that nothing ever read; that dead
+ * computation was dropped. If an early unwind was intended it has to be
+ * implemented explicitly.
+ *
+ * Always returns 0.
+ */
+int
+afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local = NULL;
+
+        int call_count  = -1;
+        int child_index = (long) cookie;
+
+        local = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        if (local->success_count == 0) {
+                                local->op_ret = op_ret;
+                        }
+                        local->success_count++;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                local->transaction.unwind (frame, this);
+
+                local->transaction.resume (frame, this);
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_unlink_wind - send the unlink fop to every child marked up.
+ *
+ * local->call_count is set to the number of up children; with none up the
+ * transaction is resumed right away. The child index travels as cookie to
+ * afr_unlink_wind_cbk.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_unlink_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = -1;
+        int            child     = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->unlink,
+                                   &local->loc);
+
+                /* stop as soon as the planned number of winds is out */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_unlink_done - "done" hook of the unlink transaction.
+ *
+ * Invokes the transaction's unwind hook and then destroys the transaction
+ * frame with AFR_STACK_DESTROY. Always returns 0.
+ */
+int32_t
+afr_unlink_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = NULL;
+
+        local = frame->local;
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/**
+ * afr_unlink - entry point of the unlink fop.
+ *
+ * Copies @frame into a separate transaction frame, attaches a freshly
+ * initialised afr_local_t holding a copy of @loc, registers the
+ * wind/done/unwind hooks and starts an AFR_ENTRY_TRANSACTION on the copied
+ * frame. @frame is recorded as transaction.main_frame.
+ *
+ * On a setup failure the transaction frame (if it exists) is destroyed and
+ * @frame is unwound with op_ret -1. A failed frame copy reports ENOMEM
+ * instead of leaving op_errno at 0.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_unlink (call_frame_t *frame, xlator_t *this,
+            loc_t *loc)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* do not unwind a failure with op_errno == 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        loc_copy (&local->loc, loc);
+
+        local->transaction.fop    = afr_unlink_wind;
+        local->transaction.done   = afr_unlink_done;
+        local->transaction.unwind = afr_unlink_unwind;
+
+        afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.basename   = AFR_BASENAME (loc->path);
+        local->transaction.pending    = AFR_ENTRY_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ rmdir */
+
+
+
+/**
+ * afr_rmdir_unwind - reply to the caller of rmdir, at most once.
+ *
+ * transaction.main_frame is fetched and cleared under the frame lock, so
+ * only the first invocation finds a frame to unwind with the collected
+ * op_ret / op_errno.
+ *
+ * Always returns 0.
+ */
+int
+afr_rmdir_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local      = frame->local;
+        call_frame_t *main_frame = NULL;
+
+        LOCK (&frame->lock);
+        {
+                /* take ownership of the caller's frame and clear the slot */
+                main_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!main_frame)
+                return 0;
+
+        AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno);
+
+        return 0;
+}
+
+
+/**
+ * afr_rmdir_wind_cbk - collect one child's reply to the rmdir fop.
+ *
+ * @cookie carries the index of the replying child. The first successful
+ * reply sets op_ret, every successful reply bumps success_count and every
+ * reply overwrites op_errno. When no calls are outstanding any more the
+ * transaction's unwind and resume hooks are invoked.
+ *
+ * NOTE(review): an earlier revision also compared success_count with
+ * priv->wait_count into a flag that nothing ever read; that dead
+ * computation was dropped. If an early unwind was intended it has to be
+ * implemented explicitly.
+ *
+ * Always returns 0.
+ */
+int
+afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local = NULL;
+
+        int call_count  = -1;
+        int child_index = (long) cookie;
+
+        local = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        if (local->success_count == 0) {
+                                local->op_ret = op_ret;
+                        }
+                        local->success_count++;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                local->transaction.unwind (frame, this);
+
+                local->transaction.resume (frame, this);
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_rmdir_wind - send the rmdir fop to every child marked up.
+ *
+ * local->call_count is set to the number of up children; with none up the
+ * transaction is resumed right away. The child index travels as cookie to
+ * afr_rmdir_wind_cbk.
+ *
+ * Always returns 0.
+ */
+int
+afr_rmdir_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = -1;
+        int            child     = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->rmdir,
+                                   &local->loc);
+
+                /* stop as soon as the planned number of winds is out */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_rmdir_done - "done" hook of the rmdir transaction.
+ *
+ * Invokes the transaction's unwind hook and then destroys the transaction
+ * frame with AFR_STACK_DESTROY. Always returns 0.
+ */
+int
+afr_rmdir_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = NULL;
+
+        local = frame->local;
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/**
+ * afr_rmdir - entry point of the rmdir fop.
+ *
+ * Copies @frame into a separate transaction frame, attaches a freshly
+ * initialised afr_local_t holding a copy of @loc, registers the
+ * wind/done/unwind hooks and starts an AFR_ENTRY_TRANSACTION on the copied
+ * frame. @frame is recorded as transaction.main_frame.
+ *
+ * On a setup failure the transaction frame (if it exists) is destroyed and
+ * @frame is unwound with op_ret -1. A failed frame copy reports ENOMEM
+ * instead of leaving op_errno at 0.
+ *
+ * Always returns 0.
+ */
+int
+afr_rmdir (call_frame_t *frame, xlator_t *this,
+           loc_t *loc)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* do not unwind a failure with op_errno == 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        loc_copy (&local->loc, loc);
+
+        local->transaction.fop    = afr_rmdir_wind;
+        local->transaction.done   = afr_rmdir_done;
+        local->transaction.unwind = afr_rmdir_unwind;
+
+        afr_build_parent_loc (&local->transaction.parent_loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.basename   = AFR_BASENAME (loc->path);
+        local->transaction.pending    = AFR_ENTRY_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ setdents */
+
+/**
+ * afr_setdents_wind_cbk - collect one child's reply to the setdents fop.
+ *
+ * @cookie carries the index of the replying child. Only the first
+ * successful reply is recorded in op_ret / success_count; op_errno is
+ * overwritten by every reply. When no calls are outstanding any more the
+ * transaction is resumed. Unlike the other entry fops in this file no
+ * unwind hook is called here; the reply is sent from afr_setdents_done.
+ *
+ * The previously assigned but never used `priv` local was removed.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_setdents_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local = NULL;
+
+        int call_count  = -1;
+        int child_index = (long) cookie;
+
+        local = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if ((op_ret != -1) && (local->success_count == 0)) {
+                        local->op_ret = op_ret;
+                        local->success_count++;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                local->transaction.resume (frame, this);
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_setdents_wind - send the setdents fop to every child marked up.
+ *
+ * local->call_count is set to the number of up children; with none up the
+ * transaction is resumed right away. The fd, flags, entries and count
+ * saved in local by afr_setdents are passed on unchanged.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_setdents_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = -1;
+        int            child     = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_setdents_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->setdents,
+                                   local->fd, local->cont.setdents.flags,
+                                   local->cont.setdents.entries,
+                                   local->cont.setdents.count);
+
+                /* stop as soon as the planned number of winds is out */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_setdents_done - "done" hook of the setdents transaction.
+ *
+ * Unwinds @frame with the op_ret / op_errno collected in its afr_local_t.
+ * Always returns 0.
+ */
+int32_t
+afr_setdents_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = NULL;
+
+        local = frame->local;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+        return 0;
+}
+
+
+/**
+ * afr_setdents - entry point of the setdents fop.
+ *
+ * Allocates and initialises an afr_local_t, attaches it to @frame, takes a
+ * reference on @fd, stores @flags, @entries and @count, registers the
+ * wind/done hooks and starts an AFR_ENTRY_TRANSACTION directly on @frame
+ * (no separate transaction frame is made here). @entries is stored by
+ * pointer, not copied.
+ *
+ * On a setup failure @frame is unwound with op_ret -1.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_setdents (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int ret      = -1;
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        frame->local = local;
+
+        local->fd = fd_ref (fd);
+
+        local->cont.setdents.flags   = flags;
+        local->cont.setdents.entries = entries;
+        local->cont.setdents.count   = count;
+
+        local->transaction.fop  = afr_setdents_wind;
+        local->transaction.done = afr_setdents_done;
+
+        local->transaction.basename = NULL;
+        local->transaction.pending  = AFR_ENTRY_PENDING;
+
+        afr_transaction (frame, this, AFR_ENTRY_TRANSACTION);
+
+        /* the transaction owns the reply from here on */
+        return 0;
+
+out:
+        /* every jump to this label happens with op_ret still at -1 */
+        AFR_STACK_UNWIND (frame, op_ret, op_errno);
+
+        return 0;
+}
+
+/* }}} */
diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h
new file mode 100644
index 000000000..e6e8a5e79
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-dir-write.h
@@ -0,0 +1,59 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __DIR_WRITE_H__
+#define __DIR_WRITE_H__
+
+/*
+ * Entry points of the AFR fops that modify directory entries. Each takes
+ * the caller's frame and the translator instance first.
+ */
+
+/* create the file at @loc with @flags / @mode, opened on @fd */
+int32_t
+afr_create (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, int32_t flags, mode_t mode, fd_t *fd);
+
+/* create the node at @loc with @mode and device number @dev */
+int32_t
+afr_mknod (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, mode_t mode, dev_t dev);
+
+/* create the directory at @loc with @mode */
+int32_t
+afr_mkdir (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, mode_t mode);
+
+/* remove the entry at @loc */
+int32_t
+afr_unlink (call_frame_t *frame, xlator_t *this,
+            loc_t *loc);
+
+/* remove the directory at @loc */
+int32_t
+afr_rmdir (call_frame_t *frame, xlator_t *this,
+           loc_t *loc);
+
+/* link @oldloc to @newloc */
+int32_t
+afr_link (call_frame_t *frame, xlator_t *this,
+          loc_t *oldloc, loc_t *newloc);
+
+/* rename @oldloc to @newloc */
+int32_t
+afr_rename (call_frame_t *frame, xlator_t *this,
+            loc_t *oldloc, loc_t *newloc);
+
+/*
+ * @loc is the entry on which the symlink fop is wound and @linkpath the
+ * string stored for it; the parameter is named as in the definition.
+ */
+int32_t
+afr_symlink (call_frame_t *frame, xlator_t *this,
+             const char *linkpath, loc_t *loc);
+
+/* pass @count directory @entries with @flags on the directory @fd */
+int32_t
+afr_setdents (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count);
+
+#endif /* __DIR_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
new file mode 100644
index 000000000..a6c99ec05
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -0,0 +1,721 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+
+#include "afr.h"
+
+
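+/**
+ * Common algorithm for inode read calls:
+ *
+ * - Try the fop on one child first (the first child that is up, or for
+ *   stat/fstat the child derived from the inode number)
+ * - if that call fails (op_ret == -1; the callbacks below retry on any
+ *   error, not only ENOTCONN):
+ *     try the next child, until all children have been tried
+ *
+ * Applicable to: access, stat, fstat, readlink, getxattr
+ */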
+
+/* {{{ access */
+
+/**
+ * afr_access_cbk - reply handler of the access fop.
+ *
+ * A successful reply is unwound to the caller as is. On failure the next
+ * child (cont.access.last_tried + 1) is tried with the saved loc and mask,
+ * unless all_tried() says every child has been used, in which case the
+ * failure is unwound.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_access_cbk (call_frame_t *frame, void *cookie,
+                xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+        afr_private_t  *priv     = this->private;
+        xlator_t      **children = priv->children;
+        afr_local_t    *local    = frame->local;
+
+        int last = -1;
+        int next = -1;
+
+        if (op_ret != -1)
+                goto unwind;
+
+        last = local->cont.access.last_tried;
+        if (all_tried (last, priv->child_count))
+                goto unwind;
+
+        next = ++local->cont.access.last_tried;
+
+        STACK_WIND_COOKIE (frame, afr_access_cbk,
+                           (void *) (long) next,
+                           children[next],
+                           children[next]->fops->access,
+                           &local->loc, local->cont.access.mask);
+
+        return 0;
+
+unwind:
+        AFR_STACK_UNWIND (frame, op_ret, op_errno);
+
+        return 0;
+}
+
+
+/**
+ * afr_access - entry point of the access fop.
+ *
+ * Sends the call to the first child that is up and remembers that child,
+ * a copy of @loc and @mask in an afr_local_t so that afr_access_cbk can
+ * retry on the following children.
+ *
+ * The local is attached to @frame right after allocation. Without that,
+ * afr_access_cbk read a NULL frame->local on the first failed reply and
+ * the allocation was never reachable again.
+ *
+ * If no child is up the frame is unwound with op_ret -1 / ENOTCONN.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_access (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, int32_t mask)
+{
+        afr_private_t  *priv       = NULL;
+        xlator_t      **children   = NULL;
+        int             call_child = 0;
+        afr_local_t    *local      = NULL;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        /* the callback reads its retry state from frame->local */
+        frame->local = local;
+
+        call_child = afr_first_up_child (priv);
+        if (call_child == -1) {
+                op_errno = ENOTCONN;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no child is up :(");
+                goto out;
+        }
+
+        local->cont.access.last_tried = call_child;
+        loc_copy (&local->loc, loc);
+        local->cont.access.mask = mask;
+
+        STACK_WIND_COOKIE (frame, afr_access_cbk,
+                           (void *) (long) call_child,
+                           children[call_child], children[call_child]->fops->access,
+                           loc, mask);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ stat */
+
+/**
+ * afr_stat_cbk - reply handler of the stat fop.
+ *
+ * @cookie is the index of the child that was asked first (the
+ * deitransform'd child). On failure the children are walked from
+ * cont.stat.last_tried + 1 onwards, skipping that first child, and the
+ * call is re-sent to the next candidate. When all_tried() reports that
+ * nothing is left, the failure is unwound. A successful reply has its
+ * st_ino replaced by the inode number saved in cont.stat.ino before it is
+ * unwound.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_stat_cbk (call_frame_t *frame, void *cookie,
+              xlator_t *this, int32_t op_ret, int32_t op_errno,
+              struct stat *buf)
+{
+        afr_private_t  *priv        = this->private;
+        xlator_t      **children    = priv->children;
+        afr_local_t    *local       = frame->local;
+        int             first_child = (long) cookie;
+
+        int last = -1;
+        int next = -1;
+
+        if (op_ret == -1) {
+                for (;;) {
+                        last = local->cont.stat.last_tried;
+                        if (all_tried (last, priv->child_count))
+                                break;
+
+                        next = ++local->cont.stat.last_tried;
+
+                        /* that child has already answered with a failure */
+                        if (next == first_child)
+                                continue;
+
+                        STACK_WIND_COOKIE (frame, afr_stat_cbk,
+                                           (void *) (long) first_child,
+                                           children[next],
+                                           children[next]->fops->stat,
+                                           &local->loc);
+
+                        return 0;
+                }
+        }
+
+        if (op_ret != -1)
+                buf->st_ino = local->cont.stat.ino;
+
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
+
+        return 0;
+}
+
+
+/**
+ * afr_stat - entry point of the stat fop.
+ *
+ * The first child to ask is derived from the inode number of @loc with
+ * afr_deitransform(). A copy of @loc, that inode number and a retry cursor
+ * of -1 are kept in an afr_local_t attached to @frame, so that
+ * afr_stat_cbk can fall back to the other children starting with the
+ * first one.
+ *
+ * On a setup failure @frame is unwound with op_ret -1 and a NULL buffer.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_stat (call_frame_t *frame, xlator_t *this,
+          loc_t *loc)
+{
+        afr_private_t  *priv       = NULL;
+        afr_local_t    *local      = NULL;
+        xlator_t      **children   = NULL;
+        int             call_child = 0;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        frame->local = local;
+
+        call_child = afr_deitransform (loc->inode->ino, priv->child_count);
+        loc_copy (&local->loc, loc);
+
+        /*
+          if stat fails on the deitransform'd child, we try
+          all children starting with the first one
+        */
+        local->cont.stat.last_tried = -1;
+        local->cont.stat.ino        = loc->inode->ino;
+
+        STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child,
+                           children[call_child],
+                           children[call_child]->fops->stat,
+                           loc);
+
+        /* the callback owns the reply from here on */
+        return 0;
+
+out:
+        /* every jump to this label happens with op_ret still at -1 */
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ fstat */
+
+/**
+ * afr_fstat_cbk - reply handler of the fstat fop.
+ *
+ * @cookie is the index of the child that was asked first (the
+ * deitransform'd child). On failure the children are walked from
+ * cont.fstat.last_tried + 1 onwards, skipping that first child, and the
+ * call is re-sent on the saved fd. When all_tried() reports that nothing
+ * is left, the failure is unwound. A successful reply has its st_ino
+ * replaced by the inode number saved in cont.fstat.ino before it is
+ * unwound.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_fstat_cbk (call_frame_t *frame, void *cookie,
+               xlator_t *this, int32_t op_ret, int32_t op_errno,
+               struct stat *buf)
+{
+        afr_private_t  *priv        = this->private;
+        xlator_t      **children    = priv->children;
+        afr_local_t    *local       = frame->local;
+        int             first_child = (long) cookie;
+
+        int last = -1;
+        int next = -1;
+
+        if (op_ret == -1) {
+                for (;;) {
+                        last = local->cont.fstat.last_tried;
+                        if (all_tried (last, priv->child_count))
+                                break;
+
+                        next = ++local->cont.fstat.last_tried;
+
+                        /*
+                          skip the deitransform'd child: if we are here
+                          we must have already tried that child
+                        */
+                        if (next == first_child)
+                                continue;
+
+                        STACK_WIND_COOKIE (frame, afr_fstat_cbk,
+                                           (void *) (long) first_child,
+                                           children[next],
+                                           children[next]->fops->fstat,
+                                           local->fd);
+
+                        return 0;
+                }
+        }
+
+        if (op_ret != -1)
+                buf->st_ino = local->cont.fstat.ino;
+
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
+
+        return 0;
+}
+
+
+/**
+ * afr_fstat - entry point of the fstat fop.
+ *
+ * The first child to ask is derived from the inode number of @fd with
+ * afr_deitransform(). A reference on @fd, that inode number and a retry
+ * cursor of -1 are kept in an afr_local_t attached to @frame, so that
+ * afr_fstat_cbk can fall back to the other children starting with the
+ * first one.
+ *
+ * On a setup failure @frame is unwound with op_ret -1 and a NULL buffer.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_fstat (call_frame_t *frame, xlator_t *this,
+           fd_t *fd)
+{
+        afr_private_t  *priv       = NULL;
+        afr_local_t    *local      = NULL;
+        xlator_t      **children   = NULL;
+        int             call_child = 0;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        frame->local = local;
+
+        VALIDATE_OR_GOTO (fd->inode, out);
+
+        call_child = afr_deitransform (fd->inode->ino, priv->child_count);
+
+        /*
+          if fstat fails on the deitransform'd child, we try
+          all children starting with the first one
+        */
+        local->cont.fstat.last_tried = -1;
+        local->cont.fstat.ino        = fd->inode->ino;
+        local->fd                    = fd_ref (fd);
+
+        STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child,
+                           children[call_child],
+                           children[call_child]->fops->fstat,
+                           fd);
+
+        /* the callback owns the reply from here on */
+        return 0;
+
+out:
+        /* every jump to this label happens with op_ret still at -1 */
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ readlink */
+
+/**
+ * afr_readlink_cbk - reply handler of the readlink fop.
+ *
+ * A successful reply is unwound to the caller as is. On failure the next
+ * child (cont.readlink.last_tried + 1) is tried with the saved loc and
+ * size, unless all_tried() says every child has been used, in which case
+ * the failure is unwound.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_readlink_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno,
+                  const char *buf)
+{
+        afr_private_t  *priv     = this->private;
+        xlator_t      **children = priv->children;
+        afr_local_t    *local    = frame->local;
+
+        int last = -1;
+        int next = -1;
+
+        if (op_ret != -1)
+                goto unwind;
+
+        last = local->cont.readlink.last_tried;
+        if (all_tried (last, priv->child_count))
+                goto unwind;
+
+        next = ++local->cont.readlink.last_tried;
+
+        STACK_WIND_COOKIE (frame, afr_readlink_cbk,
+                           (void *) (long) next,
+                           children[next],
+                           children[next]->fops->readlink,
+                           &local->loc,
+                           local->cont.readlink.size);
+
+        return 0;
+
+unwind:
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
+
+        return 0;
+}
+
+
+/**
+ * afr_readlink - entry point of the readlink fop.
+ *
+ * Sends the call to the first child that is up and remembers that child,
+ * a copy of @loc and @size in an afr_local_t attached to @frame so that
+ * afr_readlink_cbk can retry on the following children.
+ *
+ * If no child is up, or setup fails, @frame is unwound with op_ret -1 and
+ * a NULL buffer (op_errno ENOTCONN in the former case).
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_readlink (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, size_t size)
+{
+        afr_private_t  *priv       = NULL;
+        xlator_t      **children   = NULL;
+        afr_local_t    *local      = NULL;
+        int             call_child = 0;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        frame->local = local;
+
+        call_child = afr_first_up_child (priv);
+        if (call_child == -1) {
+                op_errno = ENOTCONN;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no child is up :(");
+                goto out;
+        }
+
+        local->cont.readlink.last_tried = call_child;
+        local->cont.readlink.size       = size;
+        loc_copy (&local->loc, loc);
+
+        STACK_WIND_COOKIE (frame, afr_readlink_cbk,
+                           (void *) (long) call_child,
+                           children[call_child], children[call_child]->fops->readlink,
+                           loc, size);
+
+        /* the callback owns the reply from here on */
+        return 0;
+
+out:
+        /* every jump to this label happens with op_ret still at -1 */
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ getxattr */
+
+/**
+ * afr_getxattr_cbk - reply handler of the getxattr fop.
+ *
+ * A successful reply is unwound to the caller as is. On failure the next
+ * child (cont.getxattr.last_tried + 1) is tried with the saved loc and
+ * name, unless all_tried() says every child has been used, in which case
+ * the failure is unwound.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_getxattr_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno,
+                  dict_t *dict)
+{
+        afr_private_t  *priv     = this->private;
+        xlator_t      **children = priv->children;
+        afr_local_t    *local    = frame->local;
+
+        int last = -1;
+        int next = -1;
+
+        if (op_ret != -1)
+                goto unwind;
+
+        last = local->cont.getxattr.last_tried;
+        if (all_tried (last, priv->child_count))
+                goto unwind;
+
+        next = ++local->cont.getxattr.last_tried;
+
+        STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
+                           (void *) (long) next,
+                           children[next],
+                           children[next]->fops->getxattr,
+                           &local->loc,
+                           local->cont.getxattr.name);
+
+        return 0;
+
+unwind:
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, dict);
+
+        return 0;
+}
+
+
+/**
+ * afr_getxattr - entry point of the getxattr fop.
+ *
+ * Sends the call to the first child that is up and remembers that child,
+ * a copy of @loc and (when @name is non-NULL) a duplicate of @name in an
+ * afr_local_t attached to @frame so that afr_getxattr_cbk can retry on the
+ * following children.
+ *
+ * A failed duplication of @name is reported as ENOMEM. Otherwise a retry
+ * would be sent with a NULL name, i.e. a different request than the one
+ * the caller made.
+ *
+ * If no child is up, or setup fails, @frame is unwound with op_ret -1 and
+ * a NULL dict.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_getxattr (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, const char *name)
+{
+        afr_private_t  *priv       = NULL;
+        xlator_t      **children   = NULL;
+        int             call_child = 0;
+        afr_local_t    *local      = NULL;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+        frame->local = local;
+
+        call_child = afr_first_up_child (priv);
+        if (call_child == -1) {
+                op_errno = ENOTCONN;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no child is up :(");
+                goto out;
+        }
+
+        local->cont.getxattr.last_tried = call_child;
+        loc_copy (&local->loc, loc);
+        if (name) {
+                local->cont.getxattr.name = strdup (name);
+                if (!local->cont.getxattr.name) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "out of memory :(");
+                        op_errno = ENOMEM;
+                        goto out;
+                }
+        }
+
+        STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
+                           (void *) (long) call_child,
+                           children[call_child], children[call_child]->fops->getxattr,
+                           loc, name);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ readv */
+
+/**
+ * read algorithm:
+ *
+ * if the user has specified a read subvolume, use it
+ * otherwise -
+ * use the inode number to hash it to one of the subvolumes, and
+ * read from there (to balance read load)
+ *
+ * if any of the above read's fail, try the children in sequence
+ * beginning at the beginning
+ */
+
+/**
+ * afr_readv_cbk - reply handler of the readv fop.
+ *
+ * A successful reply is unwound to the caller as is. On failure the
+ * children are walked from cont.readv.last_tried + 1 onwards, skipping
+ * priv->read_child, and the read is re-sent with the saved fd, size and
+ * offset. When all_tried() reports that nothing is left, the failure is
+ * unwound. A failed argument validation also ends in the unwind.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_readv_cbk (call_frame_t *frame, void *cookie,
+               xlator_t *this, int32_t op_ret, int32_t op_errno,
+               struct iovec *vector, int32_t count, struct stat *buf)
+{
+        afr_private_t  *priv     = NULL;
+        afr_local_t    *local    = NULL;
+        xlator_t      **children = NULL;
+
+        int last = -1;
+        int next = -1;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+        local    = frame->local;
+
+        if (op_ret == -1) {
+                for (;;) {
+                        last = local->cont.readv.last_tried;
+                        if (all_tried (last, priv->child_count))
+                                break;
+
+                        next = ++local->cont.readv.last_tried;
+
+                        /*
+                          skip the read child: if we are here
+                          we must have already tried that child
+                        */
+                        if (next == priv->read_child)
+                                continue;
+
+                        STACK_WIND_COOKIE (frame, afr_readv_cbk,
+                                           (void *) (long) next,
+                                           children[next],
+                                           children[next]->fops->readv,
+                                           local->fd, local->cont.readv.size,
+                                           local->cont.readv.offset);
+
+                        return 0;
+                }
+        }
+
+out:
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, vector, count, buf);
+
+        return 0;
+}
+
+
+/**
+ * afr_readv - entry point of the readv fop.
+ *
+ * When priv->read_child is set (not -1) the read goes to that child and
+ * the retry cursor starts at -1, so a failure makes afr_readv_cbk walk all
+ * children from the first one. Otherwise the first child that is up is
+ * used and the cursor starts there. A reference on @fd plus @size and
+ * @offset are kept in an afr_local_t attached to @frame for the retries.
+ *
+ * If no child is up, or setup fails, @frame is unwound with op_ret -1.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_readv (call_frame_t *frame, xlator_t *this,
+           fd_t *fd, size_t size, off_t offset)
+{
+        afr_private_t  *priv       = NULL;
+        afr_local_t    *local      = NULL;
+        xlator_t      **children   = NULL;
+        int             call_child = 0;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        priv     = this->private;
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        frame->local = local;
+
+        if (priv->read_child == -1) {
+                call_child = afr_first_up_child (priv);
+                if (call_child == -1) {
+                        op_errno = ENOTCONN;
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "no child is up :(");
+                        goto out;
+                }
+
+                local->cont.readv.last_tried = call_child;
+        } else {
+                call_child = priv->read_child;
+
+                /*
+                  if read fails from the read child, we try
+                  all children starting with the first one
+                */
+                local->cont.readv.last_tried = -1;
+        }
+
+        local->fd = fd_ref (fd);
+
+        local->cont.readv.size   = size;
+        local->cont.readv.offset = offset;
+
+        STACK_WIND_COOKIE (frame, afr_readv_cbk,
+                           (void *) (long) call_child,
+                           children[call_child],
+                           children[call_child]->fops->readv,
+                           fd, size, offset);
+
+        /* the callback owns the reply from here on */
+        return 0;
+
+out:
+        /* every jump to this label happens with op_ret still at -1 */
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, 0, NULL);
+
+        return 0;
+}
+
+/* }}} */
diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h
new file mode 100644
index 000000000..6b3bd2da8
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-inode-read.h
@@ -0,0 +1,47 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __INODE_READ_H__
+#define __INODE_READ_H__
+
+/*
+ * Prototypes of the afr fop entry points declared by afr-inode-read.h.
+ * Each function takes the call frame and the translator instance first,
+ * followed by the arguments of the individual fop.
+ */
+
+int32_t
+afr_access (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t mask);
+
+int32_t
+afr_stat (call_frame_t *frame, xlator_t *this,
+ loc_t *loc);
+
+int32_t
+afr_fstat (call_frame_t *frame, xlator_t *this,
+ fd_t *fd);
+
+int32_t
+afr_readlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, size_t size);
+
+/* readv: fd, size and offset are forwarded to the readv fop of one child */
+int32_t
+afr_readv (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset);
+
+int32_t
+afr_getxattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name);
+
+#endif /* __INODE_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
new file mode 100644
index 000000000..267350b2c
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -0,0 +1,2024 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <errno.h>
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+
+#include "afr.h"
+#include "afr-transaction.h"
+
+
+/* {{{ chmod */
+
+
+/*
+ * afr_chmod_unwind - hand the chmod result back to the caller's frame.
+ *
+ * transaction.main_frame is taken and cleared under frame->lock, so a
+ * later call finds NULL and does nothing. The stat buffer is stamped
+ * with the inode number saved in cont.chmod.ino before it is returned.
+ *
+ * Always returns 0.
+ */
+int
+afr_chmod_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local      = frame->local;
+        call_frame_t *main_frame = NULL;
+
+        LOCK (&frame->lock);
+        {
+                /* claim the main frame; it is NULL if already claimed */
+                main_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!main_frame)
+                return 0;
+
+        local->cont.chmod.buf.st_ino = local->cont.chmod.ino;
+        AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+                          &local->cont.chmod.buf);
+
+        return 0;
+}
+
+
+/*
+ * afr_chmod_wind_cbk - reply handler for a chmod wound to one child.
+ *
+ * cookie carries the index of the replying child. The first successful
+ * reply supplies op_ret and the stat buffer; when the number of
+ * successes reaches priv->wait_count the caller is unwound. op_errno is
+ * overwritten by every reply. When afr_frame_return() reports 0 the
+ * transaction is resumed.
+ *
+ * Always returns 0.
+ */
+int
+afr_chmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        afr_local_t   *local       = frame->local;
+        afr_private_t *priv        = this->private;
+        int            child_index = (long) cookie;
+        int            need_unwind = 0;
+        int            call_count  = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        if (local->success_count == 0) {
+                                /* the first success decides the result */
+                                local->op_ret         = op_ret;
+                                local->cont.chmod.buf = *buf;
+                        }
+
+                        local->success_count++;
+                        need_unwind = (local->success_count
+                                       == priv->wait_count);
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        if (need_unwind)
+                afr_chmod_unwind (frame, this);
+
+        call_count = afr_frame_return (frame);
+        if (call_count == 0)
+                local->transaction.resume (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * afr_chmod_wind - wind chmod to every child flagged in local->child_up.
+ *
+ * If no child is up the transaction is resumed right away. Otherwise
+ * local->call_count is set to the number of winds and every reply goes
+ * to afr_chmod_wind_cbk with the child index as cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_chmod_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = -1;
+        int            child     = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_chmod_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->chmod,
+                                   &local->loc,
+                                   local->cont.chmod.mode);
+
+                /* stop as soon as every counted child has been wound */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * afr_chmod_done - last step of the chmod transaction.
+ *
+ * Calls the transaction's unwind hook and then destroys the transaction
+ * frame.
+ *
+ * Always returns 0.
+ */
+int
+afr_chmod_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/*
+ * afr_chmod - entry point for the chmod fop.
+ *
+ * Copies the caller's frame into a transaction frame, stores the mode
+ * and the inode number of loc in a new afr_local_t and starts a
+ * metadata transaction. The caller's frame is kept in
+ * transaction.main_frame and is unwound through afr_chmod_unwind.
+ *
+ * If setup fails the caller is unwound here with op_ret -1; a failed
+ * copy_frame() is reported as ENOMEM.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_chmod (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, mode_t mode)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret      = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* without this the caller would see a failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                /*
+                 * NOTE(review): local is not attached to transaction_frame
+                 * yet, so it does not appear to be released on this path -
+                 * confirm.
+                 */
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        local->cont.chmod.mode = mode;
+        local->cont.chmod.ino  = loc->inode->ino;
+
+        local->transaction.fop    = afr_chmod_wind;
+        local->transaction.done   = afr_chmod_done;
+        local->transaction.unwind = afr_chmod_unwind;
+
+        loc_copy (&local->loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.start      = 0;
+        local->transaction.len        = 0;
+        local->transaction.pending    = AFR_METADATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+
+/* {{{ fchmod */
+
+/*
+ * afr_fchmod_unwind - hand the fchmod result back to the caller's frame.
+ *
+ * transaction.main_frame is taken and cleared under frame->lock, so a
+ * later call finds NULL and does nothing. The stat buffer is stamped
+ * with the inode number saved in cont.fchmod.ino before it is returned.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchmod_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local      = frame->local;
+        call_frame_t *main_frame = NULL;
+
+        LOCK (&frame->lock);
+        {
+                /* claim the main frame; it is NULL if already claimed */
+                main_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!main_frame)
+                return 0;
+
+        local->cont.fchmod.buf.st_ino = local->cont.fchmod.ino;
+        AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+                          &local->cont.fchmod.buf);
+
+        return 0;
+}
+
+
+/*
+ * afr_fchmod_wind_cbk - reply handler for an fchmod wound to one child.
+ *
+ * cookie carries the index of the replying child. The first successful
+ * reply supplies op_ret and the stat buffer; when the number of
+ * successes reaches priv->wait_count the caller is unwound. op_errno is
+ * overwritten by every reply. When afr_frame_return() reports 0 the
+ * transaction is resumed.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        afr_local_t   *local       = frame->local;
+        afr_private_t *priv        = this->private;
+        int            child_index = (long) cookie;
+        int            need_unwind = 0;
+        int            call_count  = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        if (local->success_count == 0) {
+                                /* the first success decides the result */
+                                local->op_ret          = op_ret;
+                                local->cont.fchmod.buf = *buf;
+                        }
+
+                        local->success_count++;
+                        need_unwind = (local->success_count
+                                       == priv->wait_count);
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        if (need_unwind)
+                afr_fchmod_unwind (frame, this);
+
+        call_count = afr_frame_return (frame);
+        if (call_count == 0)
+                local->transaction.resume (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * afr_fchmod_wind - wind fchmod to every child flagged in local->child_up.
+ *
+ * If no child is up the transaction is resumed right away. Otherwise
+ * local->call_count is set to the number of winds and every reply goes
+ * to afr_fchmod_wind_cbk with the child index as cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchmod_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = -1;
+        int            child     = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_fchmod_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->fchmod,
+                                   local->fd,
+                                   local->cont.fchmod.mode);
+
+                /* stop as soon as every counted child has been wound */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * afr_fchmod_done - last step of the fchmod transaction.
+ *
+ * Calls the transaction's unwind hook and then destroys the transaction
+ * frame.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchmod_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/*
+ * afr_fchmod - entry point for the fchmod fop.
+ *
+ * Copies the caller's frame into a transaction frame, stores the mode
+ * and the inode number of fd in a new afr_local_t, takes a reference on
+ * fd and starts a metadata transaction. The caller's frame is kept in
+ * transaction.main_frame and is unwound through afr_fchmod_unwind.
+ *
+ * If setup fails the caller is unwound here with op_ret -1; a failed
+ * copy_frame() is reported as ENOMEM.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_fchmod (call_frame_t *frame, xlator_t *this,
+            fd_t *fd, mode_t mode)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret      = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* without this the caller would see a failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                /*
+                 * NOTE(review): local is not attached to transaction_frame
+                 * yet, so it does not appear to be released on this path -
+                 * confirm.
+                 */
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        local->cont.fchmod.mode = mode;
+        local->cont.fchmod.ino  = fd->inode->ino;
+
+        local->transaction.fop    = afr_fchmod_wind;
+        local->transaction.done   = afr_fchmod_done;
+        local->transaction.unwind = afr_fchmod_unwind;
+
+        local->fd = fd_ref (fd);
+
+        local->transaction.main_frame = frame;
+        local->transaction.start      = 0;
+        local->transaction.len        = 0;
+        local->transaction.pending    = AFR_METADATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ chown */
+
+/*
+ * afr_chown_unwind - hand the chown result back to the caller's frame.
+ *
+ * transaction.main_frame is taken and cleared under frame->lock, so a
+ * later call finds NULL and does nothing. The stat buffer is stamped
+ * with the inode number saved in cont.chown.ino before it is returned.
+ *
+ * Always returns 0.
+ */
+int
+afr_chown_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local      = frame->local;
+        call_frame_t *main_frame = NULL;
+
+        LOCK (&frame->lock);
+        {
+                /* claim the main frame; it is NULL if already claimed */
+                main_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!main_frame)
+                return 0;
+
+        local->cont.chown.buf.st_ino = local->cont.chown.ino;
+        AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+                          &local->cont.chown.buf);
+
+        return 0;
+}
+
+
+/*
+ * afr_chown_wind_cbk - reply handler for a chown wound to one child.
+ *
+ * cookie carries the index of the replying child. The first successful
+ * reply supplies op_ret and the stat buffer; when the number of
+ * successes reaches priv->wait_count the transaction's unwind hook is
+ * called. op_errno is overwritten by every reply. When
+ * afr_frame_return() reports 0 the transaction is resumed.
+ *
+ * Always returns 0.
+ */
+int
+afr_chown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        afr_local_t   *local       = frame->local;
+        afr_private_t *priv        = this->private;
+        int            child_index = (long) cookie;
+        int            need_unwind = 0;
+        int            call_count  = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        if (local->success_count == 0) {
+                                /* the first success decides the result */
+                                local->op_ret         = op_ret;
+                                local->cont.chown.buf = *buf;
+                        }
+
+                        local->success_count++;
+                        need_unwind = (local->success_count
+                                       == priv->wait_count);
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        if (need_unwind)
+                local->transaction.unwind (frame, this);
+
+        call_count = afr_frame_return (frame);
+        if (call_count == 0)
+                local->transaction.resume (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * afr_chown_wind - wind chown to every child flagged in local->child_up.
+ *
+ * If no child is up the transaction is resumed right away. Otherwise
+ * local->call_count is set to the number of winds and every reply goes
+ * to afr_chown_wind_cbk with the child index as cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_chown_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = -1;
+        int            child     = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_chown_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->chown,
+                                   &local->loc, local->cont.chown.uid,
+                                   local->cont.chown.gid);
+
+                /* stop as soon as every counted child has been wound */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * afr_chown_done - last step of the chown transaction.
+ *
+ * Calls the transaction's unwind hook and then destroys the transaction
+ * frame.
+ *
+ * Always returns 0.
+ */
+int
+afr_chown_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/*
+ * afr_chown - entry point for the chown fop.
+ *
+ * Copies the caller's frame into a transaction frame, stores uid, gid
+ * and the inode number of loc in a new afr_local_t and starts a
+ * metadata transaction. The caller's frame is kept in
+ * transaction.main_frame and is unwound through afr_chown_unwind.
+ *
+ * If setup fails the caller is unwound here with op_ret -1; a failed
+ * copy_frame() is reported as ENOMEM.
+ *
+ * Always returns 0.
+ */
+int
+afr_chown (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, uid_t uid, gid_t gid)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret      = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* without this the caller would see a failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                /*
+                 * NOTE(review): local is not attached to transaction_frame
+                 * yet, so it does not appear to be released on this path -
+                 * confirm.
+                 */
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        local->cont.chown.uid = uid;
+        local->cont.chown.gid = gid;
+        local->cont.chown.ino = loc->inode->ino;
+
+        local->transaction.fop    = afr_chown_wind;
+        local->transaction.done   = afr_chown_done;
+        local->transaction.unwind = afr_chown_unwind;
+
+        loc_copy (&local->loc, loc);
+
+        local->transaction.main_frame = frame;
+        local->transaction.start      = 0;
+        local->transaction.len        = 0;
+        local->transaction.pending    = AFR_METADATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ fchown */
+
+/*
+ * afr_fchown_unwind - hand the fchown result back to the caller's frame.
+ *
+ * transaction.main_frame is taken and cleared under frame->lock, so a
+ * later call finds NULL and does nothing. The stat buffer is stamped
+ * with the inode number saved in cont.fchown.ino before it is returned.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchown_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local      = frame->local;
+        call_frame_t *main_frame = NULL;
+
+        LOCK (&frame->lock);
+        {
+                /* claim the main frame; it is NULL if already claimed */
+                main_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!main_frame)
+                return 0;
+
+        local->cont.fchown.buf.st_ino = local->cont.fchown.ino;
+        AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+                          &local->cont.fchown.buf);
+
+        return 0;
+}
+
+
+/*
+ * afr_fchown_wind_cbk - reply handler for an fchown wound to one child.
+ *
+ * cookie carries the index of the replying child. The first successful
+ * reply supplies op_ret and the stat buffer; when the number of
+ * successes reaches priv->wait_count the transaction's unwind hook is
+ * called. op_errno is overwritten by every reply. When
+ * afr_frame_return() reports 0 the transaction is resumed.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        afr_local_t   *local       = frame->local;
+        afr_private_t *priv        = this->private;
+        int            child_index = (long) cookie;
+        int            need_unwind = 0;
+        int            call_count  = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        if (local->success_count == 0) {
+                                /* the first success decides the result */
+                                local->op_ret          = op_ret;
+                                local->cont.fchown.buf = *buf;
+                        }
+
+                        local->success_count++;
+                        need_unwind = (local->success_count
+                                       == priv->wait_count);
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        if (need_unwind)
+                local->transaction.unwind (frame, this);
+
+        call_count = afr_frame_return (frame);
+        if (call_count == 0)
+                local->transaction.resume (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * afr_fchown_wind - wind fchown to every child flagged in local->child_up.
+ *
+ * If no child is up the transaction is resumed right away. Otherwise
+ * local->call_count is set to the number of winds and every reply goes
+ * to afr_fchown_wind_cbk with the child index as cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchown_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = -1;
+        int            child     = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_fchown_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->fchown,
+                                   local->fd, local->cont.fchown.uid,
+                                   local->cont.fchown.gid);
+
+                /* stop as soon as every counted child has been wound */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * afr_fchown_done - last step of the fchown transaction.
+ *
+ * Calls the transaction's unwind hook and then destroys the transaction
+ * frame.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchown_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/*
+ * afr_fchown - entry point for the fchown fop.
+ *
+ * Copies the caller's frame into a transaction frame, stores uid, gid
+ * and the inode number of fd in a new afr_local_t, takes a reference on
+ * fd and starts a metadata transaction. The caller's frame is kept in
+ * transaction.main_frame and is unwound through afr_fchown_unwind.
+ *
+ * If setup fails the caller is unwound here with op_ret -1; a failed
+ * copy_frame() is reported as ENOMEM.
+ *
+ * Always returns 0.
+ */
+int
+afr_fchown (call_frame_t *frame, xlator_t *this,
+            fd_t *fd, uid_t uid, gid_t gid)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret      = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* without this the caller would see a failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                /*
+                 * NOTE(review): local is not attached to transaction_frame
+                 * yet, so it does not appear to be released on this path -
+                 * confirm.
+                 */
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        local->cont.fchown.uid = uid;
+        local->cont.fchown.gid = gid;
+        local->cont.fchown.ino = fd->inode->ino;
+
+        local->transaction.fop    = afr_fchown_wind;
+        local->transaction.done   = afr_fchown_done;
+        local->transaction.unwind = afr_fchown_unwind;
+
+        local->fd = fd_ref (fd);
+
+        local->transaction.main_frame = frame;
+        local->transaction.start      = 0;
+        local->transaction.len        = 0;
+        local->transaction.pending    = AFR_METADATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ writev */
+
+/*
+ * afr_writev_unwind - hand the writev result back to the caller's frame.
+ *
+ * transaction.main_frame is taken and cleared under frame->lock, so a
+ * later call finds NULL and does nothing. The stat buffer is stamped
+ * with the inode number saved in cont.writev.ino before it is returned.
+ *
+ * Always returns 0.
+ */
+int
+afr_writev_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local      = frame->local;
+        call_frame_t *main_frame = NULL;
+
+        LOCK (&frame->lock);
+        {
+                /* claim the main frame; it is NULL if already claimed */
+                main_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!main_frame)
+                return 0;
+
+        local->cont.writev.buf.st_ino = local->cont.writev.ino;
+        AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+                          &local->cont.writev.buf);
+
+        return 0;
+}
+
+
+/*
+ * afr_writev_wind_cbk - reply handler for a writev wound to one child.
+ *
+ * cookie carries the index of the replying child. The first successful
+ * reply supplies op_ret and the stat buffer; op_errno is overwritten by
+ * every reply. The caller is unwound only when afr_frame_return()
+ * reports 0, immediately before the transaction is resumed.
+ *
+ * The previous version also computed a need_unwind flag from
+ * priv->wait_count but never consulted it; that dead computation has
+ * been removed without changing behaviour.
+ *
+ * Always returns 0.
+ */
+int
+afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        afr_local_t *local       = frame->local;
+        int          child_index = (long) cookie;
+        int          call_count  = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        if (local->success_count == 0) {
+                                /* the first success decides the result */
+                                local->op_ret          = op_ret;
+                                local->cont.writev.buf = *buf;
+                        }
+                        local->success_count++;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                local->transaction.unwind (frame, this);
+
+                local->transaction.resume (frame, this);
+        }
+
+        return 0;
+}
+
+
+/*
+ * afr_writev_wind - wind writev to every child flagged in local->child_up.
+ *
+ * If no child is up the transaction is resumed right away. Otherwise
+ * local->call_count is set to the number of winds and every reply goes
+ * to afr_writev_wind_cbk with the child index as cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_writev_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = -1;
+        int            child     = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_writev_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->writev,
+                                   local->fd,
+                                   local->cont.writev.vector,
+                                   local->cont.writev.count,
+                                   local->cont.writev.offset);
+
+                /* stop as soon as every counted child has been wound */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * afr_writev_done - last step of the writev transaction.
+ *
+ * Drops the reference held in cont.writev.refs (if any), calls the
+ * transaction's unwind hook and then destroys the transaction frame.
+ *
+ * Always returns 0.
+ */
+int
+afr_writev_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        if (local->cont.writev.refs) {
+                dict_unref (local->cont.writev.refs);
+                local->cont.writev.refs = NULL;
+        }
+
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/*
+ * afr_writev - entry point for the writev fop.
+ *
+ * Copies the caller's frame into a transaction frame, duplicates the
+ * iovec, stores count, offset and the inode number of fd in a new
+ * afr_local_t, takes references on fd and on the request's req_refs
+ * (when present) and starts a data transaction. For an fd opened with
+ * O_APPEND the transaction start and len are both 0; otherwise they are
+ * the write offset and the total iovec length.
+ *
+ * If setup fails the caller is unwound here with op_ret -1; a failed
+ * copy_frame() is reported as ENOMEM.
+ *
+ * Always returns 0.
+ */
+int
+afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+            struct iovec *vector, int32_t count, off_t offset)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret      = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* without this the caller would see a failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                /*
+                 * NOTE(review): local is not attached to transaction_frame
+                 * yet, so it does not appear to be released on this path -
+                 * confirm.
+                 */
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        local->op = GF_FOP_WRITE;
+        /* NOTE(review): the result of iov_dup is not checked - confirm */
+        local->cont.writev.vector = iov_dup (vector, count);
+        local->cont.writev.count  = count;
+        local->cont.writev.offset = offset;
+        local->cont.writev.ino    = fd->inode->ino;
+
+        if (frame->root->req_refs)
+                local->cont.writev.refs = dict_ref (frame->root->req_refs);
+
+        local->transaction.fop    = afr_writev_wind;
+        local->transaction.done   = afr_writev_done;
+        local->transaction.unwind = afr_writev_unwind;
+
+        local->fd = fd_ref (fd);
+
+        local->transaction.main_frame = frame;
+        if (fd->flags & O_APPEND) {
+                local->transaction.start = 0;
+                local->transaction.len   = 0;
+        } else {
+                local->transaction.start = offset;
+                local->transaction.len   = iov_length (vector, count);
+        }
+
+        local->transaction.pending = AFR_DATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ truncate */
+
+/*
+ * afr_truncate_unwind - hand the truncate result back to the caller's frame.
+ *
+ * transaction.main_frame is taken and cleared under frame->lock, so a
+ * later call finds NULL and does nothing. The stat buffer is stamped
+ * with the inode number saved in cont.truncate.ino before it is returned.
+ *
+ * Always returns 0.
+ */
+int
+afr_truncate_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local      = frame->local;
+        call_frame_t *main_frame = NULL;
+
+        LOCK (&frame->lock);
+        {
+                /* claim the main frame; it is NULL if already claimed */
+                main_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!main_frame)
+                return 0;
+
+        local->cont.truncate.buf.st_ino = local->cont.truncate.ino;
+        AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+                          &local->cont.truncate.buf);
+
+        return 0;
+}
+
+
+/*
+ * afr_truncate_wind_cbk - reply handler for a truncate wound to one child.
+ *
+ * cookie carries the index of the replying child. The first successful
+ * reply supplies op_ret and the stat buffer; when the number of
+ * successes reaches priv->wait_count the transaction's unwind hook is
+ * called. op_errno is overwritten by every reply. When
+ * afr_frame_return() reports 0 the transaction is resumed.
+ *
+ * Always returns 0.
+ */
+int
+afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        afr_local_t   *local       = frame->local;
+        afr_private_t *priv        = this->private;
+        int            child_index = (long) cookie;
+        int            need_unwind = 0;
+        int            call_count  = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        if (local->success_count == 0) {
+                                /* the first success decides the result */
+                                local->op_ret            = op_ret;
+                                local->cont.truncate.buf = *buf;
+                        }
+
+                        local->success_count++;
+                        need_unwind = (local->success_count
+                                       == priv->wait_count);
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        if (need_unwind)
+                local->transaction.unwind (frame, this);
+
+        call_count = afr_frame_return (frame);
+        if (call_count == 0)
+                local->transaction.resume (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * afr_truncate_wind - wind truncate to every child flagged in
+ * local->child_up.
+ *
+ * If no child is up the transaction is resumed right away. Otherwise
+ * local->call_count is set to the number of winds and every reply goes
+ * to afr_truncate_wind_cbk with the child index as cookie.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_truncate_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = -1;
+        int            child     = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->truncate,
+                                   &local->loc,
+                                   local->cont.truncate.offset);
+
+                /* stop as soon as every counted child has been wound */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * afr_truncate_done - last step of the truncate transaction.
+ *
+ * Calls the transaction's unwind hook and then destroys the transaction
+ * frame.
+ *
+ * Always returns 0.
+ */
+int
+afr_truncate_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/*
+ * afr_truncate - entry point for the truncate fop.
+ *
+ * Copies the caller's frame into a transaction frame, stores the offset
+ * and the inode number of loc in a new afr_local_t and starts a data
+ * transaction. The caller's frame is kept in transaction.main_frame and
+ * is unwound through afr_truncate_unwind.
+ *
+ * If setup fails the caller is unwound here with op_ret -1; a failed
+ * copy_frame() is reported as ENOMEM.
+ *
+ * Always returns 0.
+ */
+int
+afr_truncate (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, off_t offset)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret      = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* without this the caller would see a failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                /*
+                 * NOTE(review): local is not attached to transaction_frame
+                 * yet, so it does not appear to be released on this path -
+                 * confirm.
+                 */
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        local->op_ret = -1;
+
+        local->cont.truncate.offset = offset;
+        local->cont.truncate.ino    = loc->inode->ino;
+
+        local->transaction.fop    = afr_truncate_wind;
+        local->transaction.done   = afr_truncate_done;
+        local->transaction.unwind = afr_truncate_unwind;
+
+        loc_copy (&local->loc, loc);
+
+        /*
+         * NOTE(review): start/len describe [0, offset) although a
+         * truncate changes the file from offset onwards - confirm this
+         * is the intended range.
+         */
+        local->transaction.main_frame = frame;
+        local->transaction.start      = 0;
+        local->transaction.len        = offset;
+        local->transaction.pending    = AFR_DATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ ftruncate */
+
+
+/*
+ * afr_ftruncate_unwind - hand the ftruncate result back to the caller's
+ * frame.
+ *
+ * transaction.main_frame is taken and cleared under frame->lock, so a
+ * later call finds NULL and does nothing. The stat buffer is stamped
+ * with the inode number saved in cont.ftruncate.ino before it is
+ * returned.
+ *
+ * Always returns 0.
+ */
+int
+afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local      = frame->local;
+        call_frame_t *main_frame = NULL;
+
+        LOCK (&frame->lock);
+        {
+                /* claim the main frame; it is NULL if already claimed */
+                main_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!main_frame)
+                return 0;
+
+        local->cont.ftruncate.buf.st_ino = local->cont.ftruncate.ino;
+        AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+                          &local->cont.ftruncate.buf);
+
+        return 0;
+}
+
+
+/*
+ * afr_ftruncate_wind_cbk - reply handler for an ftruncate wound to one
+ * child.
+ *
+ * cookie carries the index of the replying child. The first successful
+ * reply supplies op_ret and the stat buffer; when the number of
+ * successes reaches priv->wait_count the transaction's unwind hook is
+ * called. op_errno is overwritten by every reply. When
+ * afr_frame_return() reports 0 the transaction is resumed.
+ *
+ * Always returns 0.
+ */
+int
+afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        afr_local_t   *local       = frame->local;
+        afr_private_t *priv        = this->private;
+        int            child_index = (long) cookie;
+        int            need_unwind = 0;
+        int            call_count  = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        if (local->success_count == 0) {
+                                /* the first success decides the result */
+                                local->op_ret             = op_ret;
+                                local->cont.ftruncate.buf = *buf;
+                        }
+
+                        local->success_count++;
+                        need_unwind = (local->success_count
+                                       == priv->wait_count);
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        if (need_unwind)
+                local->transaction.unwind (frame, this);
+
+        call_count = afr_frame_return (frame);
+        if (call_count == 0)
+                local->transaction.resume (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * afr_ftruncate_wind - wind ftruncate to every child flagged in
+ * local->child_up.
+ *
+ * If no child is up the transaction is resumed right away. Otherwise
+ * local->call_count is set to the number of winds and every reply goes
+ * to afr_ftruncate_wind_cbk with the child index as cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_ftruncate_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = -1;
+        int            child     = 0;
+
+        remaining = afr_up_children_count (priv->child_count, local->child_up);
+        if (remaining == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->ftruncate,
+                                   local->fd, local->cont.ftruncate.offset);
+
+                /* stop as soon as every counted child has been wound */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * afr_ftruncate_done - last step of the ftruncate transaction.
+ *
+ * Calls the transaction's unwind hook and then destroys the transaction
+ * frame.
+ *
+ * Always returns 0.
+ */
+int
+afr_ftruncate_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        local->transaction.unwind (frame, this);
+
+        AFR_STACK_DESTROY (frame);
+
+        return 0;
+}
+
+
+/*
+ * afr_ftruncate - entry point for the ftruncate fop.
+ *
+ * Copies the caller's frame into a transaction frame, stores the offset
+ * and the inode number of fd in a new afr_local_t, takes a reference on
+ * fd and starts a data transaction. The caller's frame is kept in
+ * transaction.main_frame and is unwound through afr_ftruncate_unwind.
+ *
+ * If setup fails the caller is unwound here with op_ret -1; a failed
+ * copy_frame() is reported as ENOMEM.
+ *
+ * Always returns 0.
+ */
+int
+afr_ftruncate (call_frame_t *frame, xlator_t *this,
+               fd_t *fd, off_t offset)
+{
+        afr_private_t *priv              = NULL;
+        afr_local_t   *local             = NULL;
+        call_frame_t  *transaction_frame = NULL;
+
+        int ret      = -1;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        transaction_frame = copy_frame (frame);
+        if (!transaction_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                /* without this the caller would see a failure with errno 0 */
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                /*
+                 * NOTE(review): local is not attached to transaction_frame
+                 * yet, so it does not appear to be released on this path -
+                 * confirm.
+                 */
+                op_errno = -ret;
+                goto out;
+        }
+
+        transaction_frame->local = local;
+
+        local->op     = GF_FOP_FTRUNCATE;
+        local->op_ret = -1;
+
+        local->cont.ftruncate.offset = offset;
+        local->cont.ftruncate.ino    = fd->inode->ino;
+
+        local->transaction.fop    = afr_ftruncate_wind;
+        local->transaction.done   = afr_ftruncate_done;
+        local->transaction.unwind = afr_ftruncate_unwind;
+
+        local->fd = fd_ref (fd);
+
+        /*
+         * NOTE(review): start/len describe [0, offset) although a
+         * truncate changes the file from offset onwards - confirm this
+         * is the intended range.
+         */
+        local->transaction.main_frame = frame;
+        local->transaction.start      = 0;
+        local->transaction.len        = offset;
+        local->transaction.pending    = AFR_DATA_PENDING;
+
+        afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ utimens */
+
+
+/*
+ * afr_utimens_unwind - hand the utimens result back to the caller's frame.
+ *
+ * transaction.main_frame is taken and cleared under frame->lock, so a
+ * later call finds NULL and does nothing. The stat buffer is stamped
+ * with the inode number saved in cont.utimens.ino before it is returned.
+ *
+ * Always returns 0.
+ */
+int
+afr_utimens_unwind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t  *local      = frame->local;
+        call_frame_t *main_frame = NULL;
+
+        LOCK (&frame->lock);
+        {
+                /* claim the main frame; it is NULL if already claimed */
+                main_frame = local->transaction.main_frame;
+                local->transaction.main_frame = NULL;
+        }
+        UNLOCK (&frame->lock);
+
+        if (!main_frame)
+                return 0;
+
+        local->cont.utimens.buf.st_ino = local->cont.utimens.ino;
+        AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
+                          &local->cont.utimens.buf);
+
+        return 0;
+}
+
+
+/*
+ * afr_utimens_wind_cbk - reply handler for a utimens wound to one child.
+ *
+ * cookie carries the index of the replying child. The first successful
+ * reply supplies op_ret and the stat buffer; when the number of
+ * successes reaches priv->wait_count the transaction's unwind hook is
+ * called. op_errno is overwritten by every reply. When
+ * afr_frame_return() reports 0 the transaction is resumed.
+ *
+ * need_unwind starts at 0: it used to start at 1, which unwound the
+ * caller on the very first reply, even a failed one, regardless of
+ * wait_count.
+ *
+ * Always returns 0.
+ */
+int
+afr_utimens_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        afr_local_t   *local = NULL;
+        afr_private_t *priv  = NULL;
+
+        int child_index = (long) cookie;
+        int call_count  = -1;
+        int need_unwind = 0;
+
+        local = frame->local;
+        priv  = this->private;
+
+        LOCK (&frame->lock);
+        {
+                if (child_went_down (op_ret, op_errno))
+                        afr_transaction_child_died (frame, this, child_index);
+
+                if (op_ret != -1) {
+                        if (local->success_count == 0) {
+                                /* the first success decides the result */
+                                local->op_ret           = op_ret;
+                                local->cont.utimens.buf = *buf;
+                        }
+                        local->success_count++;
+
+                        if (local->success_count == priv->wait_count) {
+                                need_unwind = 1;
+                        }
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        if (need_unwind)
+                local->transaction.unwind (frame, this);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                local->transaction.resume (frame, this);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Wind utimens to every child marked up in local->child_up.
+ * When no child is up the transaction is resumed straight away.
+ */
+int
+afr_utimens_wind (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t   *local = frame->local;
+	afr_private_t *priv  = this->private;
+	int remaining = -1;
+	int child     = 0;
+
+	remaining = afr_up_children_count (priv->child_count, local->child_up);
+	if (remaining == 0) {
+		local->transaction.resume (frame, this);
+		return 0;
+	}
+
+	local->call_count = remaining;
+
+	for (child = 0; child < priv->child_count; child++) {
+		if (!local->child_up[child])
+			continue;
+
+		STACK_WIND_COOKIE (frame, afr_utimens_wind_cbk,
+				   (void *) (long) child,
+				   priv->children[child],
+				   priv->children[child]->fops->utimens,
+				   &local->loc,
+				   local->cont.utimens.tv);
+
+		/* stop once every counted child has been wound to */
+		if (--remaining == 0)
+			break;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Final step of the utimens transaction: invoke the unwind hook (a no-op
+ * if the caller was already unwound) and destroy the transaction frame.
+ */
+int
+afr_utimens_done (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t *local = frame->local;
+
+	local->transaction.unwind (frame, this);
+	AFR_STACK_DESTROY (frame);
+
+	return 0;
+}
+
+
+/**
+ * afr_utimens - entry point of the utimens fop.
+ *
+ * Copies the caller's frame into a transaction frame, records the
+ * timestamps, the inode number and the loc in the new local, and starts
+ * a metadata transaction. On any setup failure the transaction frame is
+ * destroyed and the caller is unwound with op_ret -1.
+ */
+int
+afr_utimens (call_frame_t *frame, xlator_t *this,
+	     loc_t *loc, struct timespec tv[2])
+{
+	afr_private_t * priv  = NULL;
+	afr_local_t   * local = NULL;
+	call_frame_t   *transaction_frame = NULL;
+
+	int ret = -1;
+
+	int op_ret   = -1;
+	int op_errno = 0;
+
+	VALIDATE_OR_GOTO (frame, out);
+	VALIDATE_OR_GOTO (this, out);
+	VALIDATE_OR_GOTO (this->private, out);
+	/* loc is dereferenced below; validate it as afr_removexattr does */
+	VALIDATE_OR_GOTO (loc, out);
+
+	priv = this->private;
+
+	transaction_frame = copy_frame (frame);
+	if (!transaction_frame) {
+		gf_log (this->name, GF_LOG_ERROR,
+			"out of memory :(");
+		/* do not unwind a failure with op_errno 0 */
+		op_errno = ENOMEM;
+		goto out;
+	}
+
+	ALLOC_OR_GOTO (local, afr_local_t, out);
+
+	ret = AFR_LOCAL_INIT (local, priv);
+	if (ret < 0) {
+		op_errno = -ret;
+		goto out;
+	}
+
+	transaction_frame->local = local;
+
+	local->op_ret = -1;
+
+	local->cont.utimens.tv[0] = tv[0];
+	local->cont.utimens.tv[1] = tv[1];
+
+	local->cont.utimens.ino = loc->inode->ino;
+
+	local->transaction.fop    = afr_utimens_wind;
+	local->transaction.done   = afr_utimens_done;
+	local->transaction.unwind = afr_utimens_unwind;
+
+	loc_copy (&local->loc, loc);
+
+	local->transaction.main_frame = frame;
+	local->transaction.start   = 0;
+	local->transaction.len     = 0;
+	local->transaction.pending = AFR_METADATA_PENDING;
+
+	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+	op_ret = 0;
+out:
+	if (op_ret == -1) {
+		if (transaction_frame)
+			AFR_STACK_DESTROY (transaction_frame);
+		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+	}
+
+	return 0;
+}
+
+/* }}} */
+
+/* {{{ setxattr */
+
+
+/*
+ * Unwind the application's setxattr call, at most once: the main frame
+ * is detached under the frame lock and unwound with the recorded result.
+ */
+int
+afr_setxattr_unwind (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t  *local  = frame->local;
+	call_frame_t *caller = NULL;
+
+	LOCK (&frame->lock);
+	{
+		caller = local->transaction.main_frame;
+		local->transaction.main_frame = NULL;
+	}
+	UNLOCK (&frame->lock);
+
+	if (!caller)
+		return 0;
+
+	AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno);
+
+	return 0;
+}
+
+
+/*
+ * Reply from one child for setxattr. The first success fixes op_ret;
+ * the caller is unwound once priv->wait_count children have succeeded,
+ * and the transaction resumes when afr_frame_return() reports zero.
+ */
+int
+afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+		       int32_t op_ret, int32_t op_errno)
+{
+	afr_local_t   *local = frame->local;
+	afr_private_t *priv  = this->private;
+	int unwind_now = 0;
+	int remaining  = -1;
+
+	LOCK (&frame->lock);
+	{
+		if (op_ret != -1) {
+			if (local->success_count == 0)
+				local->op_ret = op_ret;
+
+			local->success_count++;
+
+			if (local->success_count == priv->wait_count)
+				unwind_now = 1;
+		}
+
+		/* the errno of the latest reply always wins */
+		local->op_errno = op_errno;
+	}
+	UNLOCK (&frame->lock);
+
+	if (unwind_now)
+		local->transaction.unwind (frame, this);
+
+	remaining = afr_frame_return (frame);
+	if (remaining == 0)
+		local->transaction.resume (frame, this);
+
+	return 0;
+}
+
+
+/*
+ * Wind setxattr (dict and flags saved in local->cont.setxattr) to every
+ * child marked up; resume the transaction at once when none is up.
+ */
+int
+afr_setxattr_wind (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t   *local = frame->local;
+	afr_private_t *priv  = this->private;
+	int remaining = -1;
+	int child     = 0;
+
+	remaining = afr_up_children_count (priv->child_count, local->child_up);
+	if (remaining == 0) {
+		local->transaction.resume (frame, this);
+		return 0;
+	}
+
+	local->call_count = remaining;
+
+	for (child = 0; child < priv->child_count; child++) {
+		if (!local->child_up[child])
+			continue;
+
+		STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk,
+				   (void *) (long) child,
+				   priv->children[child],
+				   priv->children[child]->fops->setxattr,
+				   &local->loc,
+				   local->cont.setxattr.dict,
+				   local->cont.setxattr.flags);
+
+		if (--remaining == 0)
+			break;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Final step of the setxattr transaction: run the unwind hook, then
+ * destroy the transaction frame.
+ */
+int
+afr_setxattr_done (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t *local = NULL;
+
+	local = frame->local;
+	local->transaction.unwind (frame, this);
+
+	AFR_STACK_DESTROY (frame);
+
+	return 0;
+}
+
+
+/**
+ * afr_setxattr - entry point of the setxattr fop.
+ *
+ * Copies the caller's frame into a transaction frame, takes a reference
+ * on @dict, copies @loc and @flags into the new local, and starts a
+ * metadata transaction. On any setup failure the transaction frame is
+ * destroyed and the caller is unwound with op_ret -1.
+ */
+int
+afr_setxattr (call_frame_t *frame, xlator_t *this,
+	      loc_t *loc, dict_t *dict, int32_t flags)
+{
+	afr_private_t * priv  = NULL;
+	afr_local_t   * local = NULL;
+	call_frame_t   *transaction_frame = NULL;
+
+	int ret = -1;
+
+	int op_ret   = -1;
+	int op_errno = 0;
+
+	VALIDATE_OR_GOTO (frame, out);
+	VALIDATE_OR_GOTO (this, out);
+	VALIDATE_OR_GOTO (this->private, out);
+	/* loc is copied below; validate it as afr_removexattr does */
+	VALIDATE_OR_GOTO (loc, out);
+
+	priv = this->private;
+
+	transaction_frame = copy_frame (frame);
+	if (!transaction_frame) {
+		gf_log (this->name, GF_LOG_ERROR,
+			"out of memory :(");
+		/* do not unwind a failure with op_errno 0 */
+		op_errno = ENOMEM;
+		goto out;
+	}
+
+	ALLOC_OR_GOTO (local, afr_local_t, out);
+
+	ret = AFR_LOCAL_INIT (local, priv);
+	if (ret < 0) {
+		op_errno = -ret;
+		goto out;
+	}
+
+	transaction_frame->local = local;
+
+	local->op_ret = -1;
+
+	local->cont.setxattr.dict  = dict_ref (dict);
+	local->cont.setxattr.flags = flags;
+
+	local->transaction.fop    = afr_setxattr_wind;
+	local->transaction.done   = afr_setxattr_done;
+	local->transaction.unwind = afr_setxattr_unwind;
+
+	loc_copy (&local->loc, loc);
+
+	local->transaction.main_frame = frame;
+	local->transaction.start   = 0;
+	local->transaction.len     = 0;
+	local->transaction.pending = AFR_METADATA_PENDING;
+
+	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+	op_ret = 0;
+out:
+	if (op_ret == -1) {
+		if (transaction_frame)
+			AFR_STACK_DESTROY (transaction_frame);
+		AFR_STACK_UNWIND (frame, op_ret, op_errno);
+	}
+
+	return 0;
+}
+
+/* }}} */
+
+/* {{{ removexattr */
+
+
+/*
+ * Unwind the application's removexattr call, at most once: the main
+ * frame is detached under the frame lock and unwound with the recorded
+ * result.
+ */
+int
+afr_removexattr_unwind (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t  *local  = frame->local;
+	call_frame_t *caller = NULL;
+
+	LOCK (&frame->lock);
+	{
+		caller = local->transaction.main_frame;
+		local->transaction.main_frame = NULL;
+	}
+	UNLOCK (&frame->lock);
+
+	if (!caller)
+		return 0;
+
+	AFR_STACK_UNWIND (caller, local->op_ret, local->op_errno);
+
+	return 0;
+}
+
+
+/*
+ * Reply from one child for removexattr. The first success fixes op_ret;
+ * the caller is unwound once priv->wait_count children have succeeded,
+ * and the transaction resumes when afr_frame_return() reports zero.
+ */
+int
+afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+			  int32_t op_ret, int32_t op_errno)
+{
+	afr_local_t   *local = frame->local;
+	afr_private_t *priv  = this->private;
+	int unwind_now = 0;
+	int remaining  = -1;
+
+	LOCK (&frame->lock);
+	{
+		if (op_ret != -1) {
+			if (local->success_count == 0)
+				local->op_ret = op_ret;
+
+			local->success_count++;
+
+			if (local->success_count == priv->wait_count)
+				unwind_now = 1;
+		}
+
+		/* the errno of the latest reply always wins */
+		local->op_errno = op_errno;
+	}
+	UNLOCK (&frame->lock);
+
+	if (unwind_now)
+		local->transaction.unwind (frame, this);
+
+	remaining = afr_frame_return (frame);
+	if (remaining == 0)
+		local->transaction.resume (frame, this);
+
+	return 0;
+}
+
+
+/*
+ * Wind removexattr (name saved in local->cont.removexattr) to every
+ * child marked up; resume the transaction at once when none is up.
+ */
+int32_t
+afr_removexattr_wind (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t   *local = frame->local;
+	afr_private_t *priv  = this->private;
+	int remaining = -1;
+	int child     = 0;
+
+	remaining = afr_up_children_count (priv->child_count, local->child_up);
+	if (remaining == 0) {
+		local->transaction.resume (frame, this);
+		return 0;
+	}
+
+	local->call_count = remaining;
+
+	for (child = 0; child < priv->child_count; child++) {
+		if (!local->child_up[child])
+			continue;
+
+		STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk,
+				   (void *) (long) child,
+				   priv->children[child],
+				   priv->children[child]->fops->removexattr,
+				   &local->loc,
+				   local->cont.removexattr.name);
+
+		if (--remaining == 0)
+			break;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Final step of the removexattr transaction: run the unwind hook, then
+ * destroy the transaction frame.
+ */
+int
+afr_removexattr_done (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t *local = NULL;
+
+	local = frame->local;
+	local->transaction.unwind (frame, this);
+
+	AFR_STACK_DESTROY (frame);
+
+	return 0;
+}
+
+
+/**
+ * afr_removexattr - entry point of the removexattr fop.
+ *
+ * Copies the caller's frame into a transaction frame, duplicates @name
+ * and copies @loc into the new local, and starts a metadata transaction.
+ * On any setup failure the transaction frame is destroyed and the caller
+ * is unwound with op_ret -1.
+ */
+int
+afr_removexattr (call_frame_t *frame, xlator_t *this,
+		 loc_t *loc, const char *name)
+{
+	afr_private_t * priv  = NULL;
+	afr_local_t   * local = NULL;
+	call_frame_t   *transaction_frame = NULL;
+
+	int ret = -1;
+
+	int op_ret   = -1;
+	int op_errno = 0;
+
+	VALIDATE_OR_GOTO (frame, out);
+	VALIDATE_OR_GOTO (this, out);
+	VALIDATE_OR_GOTO (this->private, out);
+	VALIDATE_OR_GOTO (loc, out);
+	/* strdup (NULL) is undefined behaviour */
+	VALIDATE_OR_GOTO (name, out);
+
+	priv = this->private;
+
+	transaction_frame = copy_frame (frame);
+	if (!transaction_frame) {
+		gf_log (this->name, GF_LOG_ERROR,
+			"out of memory :(");
+		/* do not unwind a failure with op_errno 0 */
+		op_errno = ENOMEM;
+		goto out;
+	}
+
+	ALLOC_OR_GOTO (local, afr_local_t, out);
+
+	ret = AFR_LOCAL_INIT (local, priv);
+	if (ret < 0) {
+		op_errno = -ret;
+		goto out;
+	}
+
+	transaction_frame->local = local;
+
+	local->op_ret = -1;
+
+	local->cont.removexattr.name = strdup (name);
+	if (!local->cont.removexattr.name) {
+		/* winding a NULL name to the children would be wrong */
+		gf_log (this->name, GF_LOG_ERROR,
+			"out of memory :(");
+		op_errno = ENOMEM;
+		goto out;
+	}
+
+	local->transaction.fop    = afr_removexattr_wind;
+	local->transaction.done   = afr_removexattr_done;
+	local->transaction.unwind = afr_removexattr_unwind;
+
+	loc_copy (&local->loc, loc);
+
+	local->transaction.main_frame = frame;
+	local->transaction.start   = 0;
+	local->transaction.len     = 0;
+	local->transaction.pending = AFR_METADATA_PENDING;
+
+	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+
+	op_ret = 0;
+out:
+	if (op_ret == -1) {
+		if (transaction_frame)
+			AFR_STACK_DESTROY (transaction_frame);
+		AFR_STACK_UNWIND (frame, op_ret, op_errno);
+	}
+
+	return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h
new file mode 100644
index 000000000..9c0b5cad3
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-inode-write.h
@@ -0,0 +1,63 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __INODE_WRITE_H__
+#define __INODE_WRITE_H__
+
+/*
+ * Prototypes of the afr translator's inode-modifying fops: chmod, chown,
+ * fchown, fchmod, writev, truncate, ftruncate, utimens, setxattr and
+ * removexattr. Each takes the caller's frame and the translator
+ * instance, followed by the fop-specific arguments.
+ */
+
+int32_t
+afr_chmod (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode);
+
+int32_t
+afr_chown (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, uid_t uid, gid_t gid);
+
+int
+afr_fchown (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, uid_t uid, gid_t gid);
+
+int32_t
+afr_fchmod (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, mode_t mode);
+
+int32_t
+afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset);
+
+int32_t
+afr_truncate (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, off_t offset);
+
+int32_t
+afr_ftruncate (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, off_t offset);
+
+/* tv[] holds two timespec values, passed through to the children */
+int32_t
+afr_utimens (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, struct timespec tv[2]);
+
+int32_t
+afr_setxattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, dict_t *dict, int32_t flags);
+
+int32_t
+afr_removexattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name);
+
+#endif /* __INODE_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
new file mode 100644
index 000000000..45d065169
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -0,0 +1,1073 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "byte-order.h"
+
+#include "afr.h"
+#include "afr-transaction.h"
+#include "afr-self-heal-common.h"
+#include "afr-self-heal.h"
+
+
+/**
+ * afr_sh_select_source - pick the child to heal from
+ *
+ * Returns the lowest index whose sources[] entry is non-zero, or -1 when
+ * no entry is set.
+ * TODO: take into account option 'favorite-child'
+ */
+
+int
+afr_sh_select_source (int sources[], int child_count)
+{
+	int idx = 0;
+
+	while (idx < child_count) {
+		if (sources[idx])
+			return idx;
+		idx++;
+	}
+
+	return -1;
+}
+
+
+/**
+ * afr_sh_sink_count - number of sinks, i.e. zero entries, in sources[]
+ */
+
+int
+afr_sh_sink_count (int sources[], int child_count)
+{
+	int idx   = 0;
+	int count = 0;
+
+	for (idx = 0; idx < child_count; idx++)
+		count += (sources[idx] == 0);
+
+	return count;
+}
+
+/* afr_sh_source_count - number of non-zero entries in sources[] */
+int
+afr_sh_source_count (int sources[], int child_count)
+{
+	int idx   = 0;
+	int count = 0;
+
+	for (idx = 0; idx < child_count; idx++)
+		count += (sources[idx] != 0);
+
+	return count;
+}
+
+
+/*
+ * Clear the source flag of every child that has a non-zero entry in
+ * child_errno[]. Always returns 0.
+ */
+int
+afr_sh_supress_errenous_children (int sources[], int child_errno[],
+				  int child_count)
+{
+	int idx = 0;
+
+	for (idx = 0; idx < child_count; idx++) {
+		if (!child_errno[idx])
+			continue;
+
+		sources[idx] = 0;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Drop children that cannot act as a source for @key.
+ *
+ * If no child's xattr dict holds @key, only children whose stat reports
+ * a zero size are dropped. Otherwise every child that has no dict, no
+ * @key in it, or a NULL value for it is dropped. Always returns 0.
+ */
+int
+afr_sh_supress_empty_children (int sources[], dict_t *xattr[],
+			       struct stat *buf,
+			       int child_count, const char *key)
+{
+	int32_t *pending   = NULL;
+	int      key_found = 0;
+	int      idx       = 0;
+
+	/* was the file created by afr with xattrs on any child? */
+	for (idx = 0; idx < child_count && !key_found; idx++) {
+		if (xattr[idx]
+		    && dict_get_ptr (xattr[idx], (char *)key,
+				     VOID(&pending)) == 0)
+			key_found = 1;
+	}
+
+	if (!key_found) {
+		/* suppress 0-byte files: this keeps an empty file created
+		   by directory self-heal from overwriting the 'good' file */
+		for (idx = 0; idx < child_count; idx++) {
+			if (buf[idx].st_size == 0)
+				sources[idx] = 0;
+		}
+		return 0;
+	}
+
+	for (idx = 0; idx < child_count; idx++) {
+		if (!xattr[idx]
+		    || dict_get_ptr (xattr[idx], (char *)key,
+				     VOID(&pending)) != 0
+		    || !pending)
+			sources[idx] = 0;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Log the pending matrix at debug level, one row per gf_log call.
+ * The matrix is taken to be priv->child_count x priv->child_count.
+ */
+void
+afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)
+{
+	afr_private_t * priv = this->private;
+
+	char *buf = NULL;
+	char *ptr = NULL;
+
+	int i, j;
+
+	/* Worst case per entry is "-2147483648 ": 11 characters plus one
+	   space. The extra 8 bytes cover "[ ", "]" and the trailing NUL.
+	   The previous 11 bytes per entry did not account for the sign. */
+	buf = MALLOC (priv->child_count * 12 + 8);
+	if (!buf) {
+		gf_log (this->name, GF_LOG_ERROR,
+			"out of memory :(");
+		return;
+	}
+
+	for (i = 0; i < priv->child_count; i++) {
+		ptr = buf;
+		ptr += sprintf (ptr, "[ ");
+		for (j = 0; j < priv->child_count; j++) {
+			ptr += sprintf (ptr, "%d ", pending_matrix[i][j]);
+		}
+		ptr += sprintf (ptr, "]");
+		gf_log (this->name, GF_LOG_DEBUG,
+			"pending_matrix: %s", buf);
+	}
+
+	FREE (buf);
+}
+
+
+/*
+ * Fill pending_matrix from the per-child xattr dicts.
+ *
+ * The whole matrix is zeroed first. Row i is then loaded from the value
+ * stored under @key in xattr[i], converting each entry with ntoh32();
+ * rows whose dict is missing or lacks @key stay zero.
+ */
+void
+afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[],
+			     int child_count, const char *key)
+{
+	int32_t *pending = NULL;
+	int      row     = 0;
+	int      col     = 0;
+
+	/* start clean */
+	for (row = 0; row < child_count; row++)
+		for (col = 0; col < child_count; col++)
+			pending_matrix[row][col] = 0;
+
+	for (row = 0; row < child_count; row++) {
+		if (!xattr[row])
+			continue;
+
+		pending = NULL;
+
+		if (dict_get_ptr (xattr[row], (char *) key,
+				  VOID(&pending)) != 0)
+			continue;
+
+		for (col = 0; col < child_count; col++)
+			pending_matrix[row][col] = ntoh32 (pending[col]);
+	}
+}
+
+
+/**
+ * afr_sh_mark_sources - flag every child that may serve as a source
+ *
+ * The diagonal of pending_matrix is zeroed first, so a child's pending
+ * count against itself is ignored. Child i is then marked as a source
+ * when column i holds no non-zero entry. Returns the number of sources.
+ */
+
+int
+afr_sh_mark_sources (int32_t *pending_matrix[], int sources[], int child_count)
+{
+	int row      = 0;
+	int col      = 0;
+	int nsources = 0;
+
+	/* start clean, and disregard entries that refer to themselves */
+	for (col = 0; col < child_count; col++) {
+		sources[col] = 0;
+		pending_matrix[col][col] = 0;
+	}
+
+	for (col = 0; col < child_count; col++) {
+		int accused = 0;
+
+		for (row = 0; row < child_count && !accused; row++)
+			accused = (pending_matrix[row][col] != 0);
+
+		if (accused)
+			continue;
+
+		sources[col] = 1;
+		nsources++;
+	}
+
+	return nsources;
+}
+
+
+/*
+ * Build delta_matrix from pending_matrix: every entry in a column whose
+ * success[] flag is set becomes the negated pending value; all other
+ * entries are zero.
+ */
+void
+afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[],
+			 int success[], int child_count)
+{
+	int row = 0;
+	int col = 0;
+
+	/* start clean */
+	for (row = 0; row < child_count; row++)
+		for (col = 0; col < child_count; col++)
+			delta_matrix[row][col] = 0;
+
+	for (row = 0; row < child_count; row++) {
+		for (col = 0; col < child_count; col++) {
+			if (success[col])
+				delta_matrix[row][col] =
+					-pending_matrix[row][col];
+		}
+	}
+}
+
+
+/*
+ * Store each row of delta_matrix under @key in the matching xattr dict.
+ *
+ * Every child with a dict gets a freshly allocated array of child_count
+ * values, converted with hton32(), handed to dict_set_bin(). Children
+ * without a dict are skipped.
+ *
+ * Returns 0 when every row was stored, -1 if an allocation or a
+ * dict_set_bin() call failed for at least one child.
+ */
+int
+afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[],
+		       int child_count, const char *key)
+{
+	int i = 0;
+	int j = 0;
+
+	int ret    = 0;
+	int op_ret = 0;
+
+	int32_t *pending = NULL;
+
+	for (i = 0; i < child_count; i++) {
+		if (!xattr[i])
+			continue;
+
+		pending = CALLOC (sizeof (int32_t), child_count);
+		if (!pending) {
+			/* nothing can be stored for this child; previously
+			   this fell through to a NULL dereference */
+			op_ret = -1;
+			continue;
+		}
+
+		for (j = 0; j < child_count; j++) {
+			pending[j] = hton32 (delta_matrix[i][j]);
+		}
+
+		ret = dict_set_bin (xattr[i], (char *) key, pending,
+				    child_count * sizeof (int32_t));
+		/* NOTE(review): ownership of 'pending' when dict_set_bin
+		   fails is not visible here, so it is not freed - confirm */
+		if (ret != 0)
+			op_ret = -1;
+	}
+
+	return op_ret;
+}
+
+
+/*
+ * Shared body of the afr_sh_has_*_pending checks.
+ *
+ * Looks up @key in @xattr and scans the first priv->child_count int32
+ * entries of its value, ignoring the entry at @skip_index. Returns 1 if
+ * any other entry is non-zero; 0 otherwise, including when the key is
+ * absent or its value is NULL.
+ */
+static int
+sh_xattr_has_pending (dict_t *xattr, const char *key, int skip_index,
+		      xlator_t *this)
+{
+	afr_private_t *priv    = NULL;
+	int32_t       *pending = NULL;
+	void          *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */
+
+	int ret = -1;
+	int i   = 0;
+
+	priv = this->private;
+
+	ret = dict_get_ptr (xattr, (char *) key, &tmp_pending);
+
+	if (ret != 0 || !tmp_pending)
+		return 0;
+
+	pending = tmp_pending;
+	for (i = 0; i < priv->child_count; i++) {
+		if (i == skip_index)
+			continue;
+		if (pending[i])
+			return 1;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Return 1 if @xattr records pending metadata operations against any
+ * child other than the one at index @child_count, 0 otherwise.
+ * (Despite its name, @child_count is used as the index to skip.)
+ */
+int
+afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this)
+{
+	return sh_xattr_has_pending (xattr, AFR_METADATA_PENDING,
+				     child_count, this);
+}
+
+
+/*
+ * Return 1 if @xattr records pending data operations against any child
+ * other than the one at index @child_count, 0 otherwise.
+ */
+int
+afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this)
+{
+	return sh_xattr_has_pending (xattr, AFR_DATA_PENDING,
+				     child_count, this);
+}
+
+
+/*
+ * Return 1 if @xattr records pending entry operations against any child
+ * other than the one at index @child_count, 0 otherwise.
+ */
+int
+afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this)
+{
+	return sh_xattr_has_pending (xattr, AFR_ENTRY_PENDING,
+				     child_count, this);
+}
+
+
+
+/**
+ * afr_sh_is_matrix_zero - return 1 if every entry of the
+ * child_count x child_count pending matrix is zero, 0 otherwise
+ */
+
+int
+afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count)
+{
+	int row = 0;
+	int col = 0;
+
+	for (row = 0; row < child_count; row++) {
+		const int32_t *cells = pending_matrix[row];
+
+		for (col = 0; col < child_count; col++) {
+			if (cells[col] != 0)
+				return 0;
+		}
+	}
+
+	return 1;
+}
+
+
+/*
+ * Called when the missing-entries phase is over.
+ *
+ * Clears the per-child stat buffers and drops the per-child xattr dicts
+ * (sh->child_errno is not reset here), then either aborts the self-heal
+ * through the completion callback when local->govinda_gOvinda is set, or
+ * moves on to metadata self-heal.
+ */
+int
+afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t     *local = frame->local;
+	afr_private_t   *priv  = this->private;
+	afr_self_heal_t *sh    = &local->self_heal;
+	int              child = 0;
+
+	memset (sh->buf, 0, sizeof (struct stat) * priv->child_count);
+
+	for (child = 0; child < priv->child_count; child++) {
+		if (!sh->xattr[child])
+			continue;
+
+		dict_unref (sh->xattr[child]);
+		sh->xattr[child] = NULL;
+	}
+
+	if (!local->govinda_gOvinda) {
+		gf_log (this->name, GF_LOG_DEBUG,
+			"proceeding to metadata check on %s",
+			local->loc.path);
+		afr_self_heal_metadata (frame, this);
+		return 0;
+	}
+
+	gf_log (this->name, GF_LOG_WARNING,
+		"aborting selfheal of %s",
+		local->loc.path);
+	sh->completion_cbk (frame, this);
+
+	return 0;
+}
+
+
+/*
+ * Reply to one entrylk unlock request. The result is not inspected;
+ * once afr_frame_return() reports zero the missing-entries phase is
+ * finished.
+ */
+int
+sh_missing_entries_unlck_cbk (call_frame_t *frame, void *cookie,
+			      xlator_t *this,
+			      int32_t op_ret, int32_t op_errno)
+{
+	int outstanding = 0;
+
+	/* nothing is recorded per reply; the lock/unlock pair is kept
+	   as it was */
+	LOCK (&frame->lock);
+	{
+	}
+	UNLOCK (&frame->lock);
+
+	outstanding = afr_frame_return (frame);
+	if (outstanding != 0)
+		return 0;
+
+	afr_sh_missing_entries_done (frame, this);
+
+	return 0;
+}
+
+
+/*
+ * Release the entry lock on the parent directory: wind an entrylk
+ * unlock for local->loc.name to every child marked up. The replies are
+ * collected by sh_missing_entries_unlck_cbk.
+ */
+static int
+sh_missing_entries_finish (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t     *local     = frame->local;
+	afr_private_t   *priv      = this->private;
+	afr_self_heal_t *sh        = &local->self_heal;
+	int              remaining = local->child_count;
+	int              child     = 0;
+
+	local->call_count = remaining;
+
+	for (child = 0; child < priv->child_count; child++) {
+		if (!local->child_up[child])
+			continue;
+
+		gf_log (this->name, GF_LOG_DEBUG,
+			"unlocking %"PRId64"/%s on subvolume %s",
+			sh->parent_loc.inode->ino, local->loc.name,
+			priv->children[child]->name);
+
+		STACK_WIND (frame, sh_missing_entries_unlck_cbk,
+			    priv->children[child],
+			    priv->children[child]->fops->entrylk,
+			    &sh->parent_loc, local->loc.name,
+			    ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+
+		if (--remaining == 0)
+			break;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Callback for fire-and-forget calls: the result is ignored and the
+ * whole call stack of the frame is destroyed.
+ */
+static int
+sh_destroy_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+		int32_t op_ret, int op_errno, struct stat *stbuf)
+{
+	STACK_DESTROY (frame->root);
+
+	return 0;
+}
+
+
+/*
+ * Reply to the mknod/mkdir/symlink that recreates a missing entry on
+ * one child (index in @cookie).
+ *
+ * On success a chown to the source's uid/gid is wound on a copy of the
+ * frame; its result is discarded by sh_destroy_cbk. When the last reply
+ * is in, the entry lock is released via sh_missing_entries_finish.
+ */
+static int
+sh_missing_entries_newentry_cbk (call_frame_t *frame, void *cookie,
+				 xlator_t *this,
+				 int32_t op_ret, int32_t op_errno,
+				 inode_t *inode, struct stat *stbuf)
+{
+	afr_local_t     *local = NULL;
+	afr_self_heal_t *sh = NULL;
+	afr_private_t   *priv = NULL;
+	call_frame_t    *chown_frame = NULL;
+	int              call_count = 0;
+	int              child_index = 0;
+	struct stat     *buf = NULL;
+
+
+	local = frame->local;
+	sh = &local->self_heal;
+	priv = this->private;
+
+	buf = &sh->buf[sh->source];
+	child_index = (long) cookie;
+
+	if (op_ret == 0) {
+		chown_frame = copy_frame (frame);
+
+		if (chown_frame) {
+			gf_log (this->name, GF_LOG_DEBUG,
+				"chown %s to %d %d on subvolume %s",
+				local->loc.path, buf->st_uid, buf->st_gid,
+				priv->children[child_index]->name);
+
+			STACK_WIND (chown_frame, sh_destroy_cbk,
+				    priv->children[child_index],
+				    priv->children[child_index]->fops->chown,
+				    &local->loc,
+				    buf->st_uid, buf->st_gid);
+		} else {
+			/* the chown is skipped; winding on a NULL frame
+			   would crash */
+			gf_log (this->name, GF_LOG_ERROR,
+				"out of memory :(");
+		}
+	}
+
+	LOCK (&frame->lock);
+	{
+	}
+	UNLOCK (&frame->lock);
+
+	call_count = afr_frame_return (frame);
+
+	if (call_count == 0) {
+		sh_missing_entries_finish (frame, this);
+	}
+
+	return 0;
+}
+
+
+/*
+ * Recreate a missing non-directory, non-symlink entry: wind mknod, with
+ * the source's mode and device number, to every child whose lookup
+ * failed with ENOENT.
+ */
+static int
+sh_missing_entries_mknod (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t     *local = NULL;
+	afr_self_heal_t *sh = NULL;
+	afr_private_t   *priv = NULL;
+	int              i = 0;
+	int              enoent_count = 0;
+	int              call_count = 0;
+	mode_t           st_mode = 0;
+	dev_t            st_rdev = 0;
+
+
+	local = frame->local;
+	sh = &local->self_heal;
+	priv = this->private;
+
+	for (i = 0; i < priv->child_count; i++)
+		if (sh->child_errno[i] == ENOENT)
+			enoent_count++;
+
+	call_count = enoent_count;
+	local->call_count = call_count;
+
+	st_mode = sh->buf[sh->source].st_mode;
+	/* mknod needs the device the node represents (st_rdev), not the
+	   device holding the source file (st_dev) */
+	st_rdev = sh->buf[sh->source].st_rdev;
+
+	gf_log (this->name, GF_LOG_DEBUG,
+		"mknod %s mode 0%o on %d subvolumes",
+		local->loc.path, st_mode, enoent_count);
+
+	for (i = 0; i < priv->child_count; i++) {
+		if (sh->child_errno[i] == ENOENT) {
+			STACK_WIND_COOKIE (frame,
+					   sh_missing_entries_newentry_cbk,
+					   (void *) (long) i,
+					   priv->children[i],
+					   priv->children[i]->fops->mknod,
+					   &local->loc, st_mode, st_rdev);
+			if (!--call_count)
+				break;
+		}
+	}
+
+	return 0;
+}
+
+
+/*
+ * Recreate a missing directory: wind mkdir with the source's mode to
+ * every child whose lookup failed with ENOENT.
+ */
+static int
+sh_missing_entries_mkdir (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t     *local     = frame->local;
+	afr_private_t   *priv      = this->private;
+	afr_self_heal_t *sh        = &local->self_heal;
+	mode_t           st_mode   = 0;
+	int              missing   = 0;
+	int              remaining = 0;
+	int              child     = 0;
+
+	for (child = 0; child < priv->child_count; child++) {
+		if (sh->child_errno[child] == ENOENT)
+			missing++;
+	}
+
+	remaining = missing;
+	local->call_count = remaining;
+
+	st_mode = sh->buf[sh->source].st_mode;
+
+	gf_log (this->name, GF_LOG_DEBUG,
+		"mkdir %s mode 0%o on %d subvolumes",
+		local->loc.path, st_mode, missing);
+
+	for (child = 0; child < priv->child_count; child++) {
+		if (sh->child_errno[child] != ENOENT)
+			continue;
+
+		STACK_WIND_COOKIE (frame,
+				   sh_missing_entries_newentry_cbk,
+				   (void *) (long) child,
+				   priv->children[child],
+				   priv->children[child]->fops->mkdir,
+				   &local->loc, st_mode);
+
+		if (--remaining == 0)
+			break;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Recreate a missing symlink: wind symlink, pointing at @link, to every
+ * child whose lookup failed with ENOENT.
+ */
+static int
+sh_missing_entries_symlink (call_frame_t *frame, xlator_t *this,
+			    const char *link)
+{
+	afr_local_t     *local     = frame->local;
+	afr_private_t   *priv      = this->private;
+	afr_self_heal_t *sh        = &local->self_heal;
+	int              missing   = 0;
+	int              remaining = 0;
+	int              child     = 0;
+
+	for (child = 0; child < priv->child_count; child++) {
+		if (sh->child_errno[child] == ENOENT)
+			missing++;
+	}
+
+	remaining = missing;
+	local->call_count = remaining;
+
+	gf_log (this->name, GF_LOG_DEBUG,
+		"symlink %s -> %s on %d subvolumes",
+		local->loc.path, link, missing);
+
+	for (child = 0; child < priv->child_count; child++) {
+		if (sh->child_errno[child] != ENOENT)
+			continue;
+
+		STACK_WIND_COOKIE (frame,
+				   sh_missing_entries_newentry_cbk,
+				   (void *) (long) child,
+				   priv->children[child],
+				   priv->children[child]->fops->symlink,
+				   link, &local->loc);
+
+		if (--remaining == 0)
+			break;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Reply to the readlink on the source: a positive op_ret means @link is
+ * usable and the symlink is recreated; otherwise the entry lock is
+ * released.
+ */
+static int
+sh_missing_entries_readlink_cbk (call_frame_t *frame, void *cookie,
+				 xlator_t *this,
+				 int32_t op_ret, int32_t op_errno,
+				 const char *link)
+{
+	if (op_ret <= 0) {
+		sh_missing_entries_finish (frame, this);
+		return 0;
+	}
+
+	sh_missing_entries_symlink (frame, this, link);
+
+	return 0;
+}
+
+
+/*
+ * Read the target of the symlink from the source child, asking for up
+ * to 4096 bytes.
+ */
+static int
+sh_missing_entries_readlink (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t     *local = frame->local;
+	afr_private_t   *priv  = this->private;
+	afr_self_heal_t *sh    = &local->self_heal;
+
+	STACK_WIND (frame, sh_missing_entries_readlink_cbk,
+		    priv->children[sh->source],
+		    priv->children[sh->source]->fops->readlink,
+		    &local->loc, 4096);
+
+	return 0;
+}
+
+
+/*
+ * Decide how to recreate the entry on children that lack it.
+ *
+ * Children whose lookup failed with ENOENT are counted. Among children
+ * without an error, the first one becomes sh->source and fixes the file
+ * type; any later child with a different type flags a conflict. A
+ * conflict or an unknown type sets local->govinda_gOvinda, which aborts
+ * the self-heal later on. If nothing has to be created the entry lock
+ * is simply released; otherwise the creation call matching the file
+ * type is started.
+ */
+static int
+sh_missing_entries_create (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t     *local = NULL;
+	afr_self_heal_t *sh = NULL;
+	int              type = 0;
+	int              i = 0;
+	afr_private_t   *priv = NULL;
+	int              enoent_count = 0;
+	int              govinda_gOvinda = 0;
+
+
+	local = frame->local;
+	sh = &local->self_heal;
+	priv = this->private;
+
+	for (i = 0; i < priv->child_count; i++) {
+		if (sh->child_errno[i]) {
+			if (sh->child_errno[i] == ENOENT)
+				enoent_count++;
+		} else {
+			if (type) {
+				if (type != (sh->buf[i].st_mode & S_IFMT))
+					govinda_gOvinda = 1;
+			} else {
+				sh->source = i;
+				type = sh->buf[i].st_mode & S_IFMT;
+			}
+		}
+	}
+
+	if (govinda_gOvinda) {
+		gf_log (this->name, GF_LOG_ERROR,
+			"conflicting filetypes exist for path %s. returning.",
+			local->loc.path);
+
+		local->govinda_gOvinda = 1;
+		sh_missing_entries_finish (frame, this);
+		return 0;
+	}
+
+	if (!type) {
+		gf_log (this->name, GF_LOG_ERROR,
+			"no source found for %s. all nodes down?. returning.",
+			local->loc.path);
+		/* subvolumes down and/or file does not exist */
+		sh_missing_entries_finish (frame, this);
+		return 0;
+	}
+
+	if (enoent_count == 0) {
+		/* not an error: nothing is missing, so this is logged at
+		   debug level like the other progress messages */
+		gf_log (this->name, GF_LOG_DEBUG,
+			"no missing files - %s. proceeding to metadata check",
+			local->loc.path);
+		/* proceed to next step - metadata self-heal */
+		sh_missing_entries_finish (frame, this);
+		return 0;
+	}
+
+	switch (type) {
+	case S_IFSOCK:
+	case S_IFREG:
+	case S_IFBLK:
+	case S_IFCHR:
+	case S_IFIFO:
+		sh_missing_entries_mknod (frame, this);
+		break;
+	case S_IFLNK:
+		sh_missing_entries_readlink (frame, this);
+		break;
+	case S_IFDIR:
+		sh_missing_entries_mkdir (frame, this);
+		break;
+	default:
+		gf_log (this->name, GF_LOG_ERROR,
+			"unknown file type: 0%o", type);
+		local->govinda_gOvinda = 1;
+		sh_missing_entries_finish (frame, this);
+		break;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Reply to the lookup on one child (index in @cookie).
+ *
+ * A successful reply (op_ret == 0) stores the stat buffer for that
+ * child; any other reply stores op_errno. When the last reply is in,
+ * sh_missing_entries_create decides what has to be recreated.
+ */
+static int
+sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie,
+			       xlator_t *this,
+			       int32_t op_ret, int32_t op_errno,
+			       inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+	afr_local_t   *local       = frame->local;
+	afr_private_t *priv        = this->private;
+	int            child_index = (long) cookie;
+	int            outstanding = 0;
+
+	LOCK (&frame->lock);
+	{
+		if (op_ret != 0) {
+			gf_log (this->name, GF_LOG_WARNING,
+				"path %s on subvolume %s => -1 (%s)",
+				local->loc.path,
+				priv->children[child_index]->name,
+				strerror (op_errno));
+
+			local->self_heal.child_errno[child_index] = op_errno;
+		} else {
+			gf_log (this->name, GF_LOG_DEBUG,
+				"path %s on subvolume %s is of mode 0%o",
+				local->loc.path,
+				priv->children[child_index]->name,
+				buf->st_mode);
+
+			local->self_heal.buf[child_index] = *buf;
+		}
+	}
+	UNLOCK (&frame->lock);
+
+	outstanding = afr_frame_return (frame);
+	if (outstanding == 0)
+		sh_missing_entries_create (frame, this);
+
+	return 0;
+}
+
+
+/*
+ * Look the entry up on every child marked up.
+ *
+ * The request dict, when it could be allocated, asks for the
+ * AFR_ENTRY_PENDING xattr with room for one int32 per child. The local
+ * reference on the dict is dropped once all lookups have been wound.
+ */
+static int
+sh_missing_entries_lookup (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t   *local     = frame->local;
+	afr_private_t *priv      = this->private;
+	dict_t        *xattr_req = NULL;
+	int            remaining = local->child_count;
+	int            child     = 0;
+	int            ret       = -1;
+
+	local->call_count = remaining;
+
+	xattr_req = dict_new();
+	if (xattr_req) {
+		ret = dict_set_uint64 (xattr_req, AFR_ENTRY_PENDING,
+				       priv->child_count * sizeof(int32_t));
+	}
+
+	for (child = 0; child < priv->child_count; child++) {
+		if (!local->child_up[child])
+			continue;
+
+		gf_log (this->name, GF_LOG_DEBUG,
+			"looking up %s on subvolume %s",
+			local->loc.path, priv->children[child]->name);
+
+		STACK_WIND_COOKIE (frame,
+				   sh_missing_entries_lookup_cbk,
+				   (void *) (long) child,
+				   priv->children[child],
+				   priv->children[child]->fops->lookup,
+				   &local->loc, xattr_req);
+
+		if (--remaining == 0)
+			break;
+	}
+
+	if (xattr_req)
+		dict_unref (xattr_req);
+
+	return 0;
+}
+
+
+/*
+ * Reply to the entrylk request on one child (index in @cookie).
+ *
+ * A failed lock marks the self-heal as failed. When the last reply is
+ * in, the locks are released again if any of them failed; otherwise the
+ * entry is looked up on the children.
+ */
+static int
+sh_missing_entries_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+			   int32_t op_ret, int32_t op_errno)
+{
+	afr_local_t     *local       = frame->local;
+	afr_self_heal_t *sh          = &local->self_heal;
+	int              child_index = (long) cookie;
+	int              outstanding = 0;
+
+	LOCK (&frame->lock);
+	{
+		if (op_ret != -1) {
+			gf_log (this->name, GF_LOG_DEBUG,
+				"inode of %s on child %d locked",
+				local->loc.path, child_index);
+		} else {
+			sh->op_failed = 1;
+
+			/* EAGAIN only rates a debug message */
+			gf_log (this->name,
+				(op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
+				"locking inode of %s on child %d failed: %s",
+				local->loc.path, child_index,
+				strerror (op_errno));
+		}
+	}
+	UNLOCK (&frame->lock);
+
+	outstanding = afr_frame_return (frame);
+	if (outstanding != 0)
+		return 0;
+
+	if (sh->op_failed == 1)
+		sh_missing_entries_finish (frame, this);
+	else
+		sh_missing_entries_lookup (frame, this);
+
+	return 0;
+}
+
+
+/*
+ * Start recreating entries that are missing on some children: build the
+ * parent loc and request a non-blocking write entrylk on local->loc.name
+ * from every child marked up.
+ */
+static int
+afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t     *local     = frame->local;
+	afr_private_t   *priv      = this->private;
+	afr_self_heal_t *sh        = &local->self_heal;
+	int              remaining = 0;
+	int              child     = 0;
+
+	gf_log (this->name, GF_LOG_DEBUG,
+		"attempting to recreate missing entries for path=%s",
+		local->loc.path);
+
+	afr_build_parent_loc (&sh->parent_loc, &local->loc);
+
+	remaining = local->child_count;
+	local->call_count = remaining;
+
+	for (child = 0; child < priv->child_count; child++) {
+		if (!local->child_up[child])
+			continue;
+
+		STACK_WIND (frame, sh_missing_entries_lk_cbk,
+			    priv->children[child],
+			    priv->children[child]->fops->entrylk,
+			    &sh->parent_loc, local->loc.name,
+			    ENTRYLK_LOCK_NB, ENTRYLK_WRLCK);
+
+		if (--remaining == 0)
+			break;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Entry point of self-heal.
+ *
+ * Records @completion_cbk, allocates the per-child bookkeeping arrays
+ * and the two child_count x child_count matrices, then starts with the
+ * missing-entries phase when the lookup saw both successes and ENOENTs,
+ * or goes straight on to the metadata check otherwise.
+ *
+ * NOTE(review): none of the allocations below is checked for failure.
+ */
+int
+afr_self_heal (call_frame_t *frame, xlator_t *this,
+	       int (*completion_cbk) (call_frame_t *, xlator_t *))
+{
+	afr_local_t     *local = frame->local;
+	afr_private_t   *priv  = this->private;
+	afr_self_heal_t *sh    = &local->self_heal;
+	int              child = 0;
+
+	gf_log (this->name, GF_LOG_DEBUG,
+		"performing self heal on %s (metadata=%d data=%d entry=%d)",
+		local->loc.path,
+		local->need_metadata_self_heal,
+		local->need_data_self_heal,
+		local->need_entry_self_heal);
+
+	sh->completion_cbk = completion_cbk;
+
+	/* per-child state */
+	sh->buf         = CALLOC (priv->child_count, sizeof (struct stat));
+	sh->child_errno = CALLOC (priv->child_count, sizeof (int));
+	sh->success     = CALLOC (priv->child_count, sizeof (int));
+	sh->xattr       = CALLOC (priv->child_count, sizeof (dict_t *));
+	sh->sources     = CALLOC (sizeof (*sh->sources), priv->child_count);
+
+	/* square matrices, one row per child */
+	sh->pending_matrix = CALLOC (sizeof (int32_t *), priv->child_count);
+	for (child = 0; child < priv->child_count; child++)
+		sh->pending_matrix[child] = CALLOC (sizeof (int32_t),
+						    priv->child_count);
+
+	sh->delta_matrix = CALLOC (sizeof (int32_t *), priv->child_count);
+	for (child = 0; child < priv->child_count; child++)
+		sh->delta_matrix[child] = CALLOC (sizeof (int32_t),
+						  priv->child_count);
+
+	if (!(local->success_count && local->enoent_count)) {
+		gf_log (this->name, GF_LOG_DEBUG,
+			"proceeding to metadata check on %s",
+			local->loc.path);
+		afr_sh_missing_entries_done (frame, this);
+		return 0;
+	}
+
+	afr_self_heal_missing_entries (frame, this);
+
+	return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h
new file mode 100644
index 000000000..9dd597f07
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-common.h
@@ -0,0 +1,66 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __AFR_SELF_HEAL_COMMON_H__
+#define __AFR_SELF_HEAL_COMMON_H__
+
+/*
+ * Non-zero when the st_size of the struct stat pointed to by `buf` is
+ * larger than st_blocks * 512, i.e. the file claims more bytes than its
+ * allocated 512-byte blocks account for. `buf` is expanded twice, so
+ * pass an expression without side effects.
+ */
+#define FILE_HAS_HOLES(buf) (((buf)->st_size) > ((buf)->st_blocks * 512))
+
+/*
+ * Helpers shared by the self-heal implementations. Only the prototypes
+ * live here; the definitions are in another file.
+ * NOTE(review): exact contracts should be confirmed against the
+ * definitions. The array parameters are presumably indexed by child
+ * (0 .. child_count - 1) -- TODO confirm.
+ */
+int
+afr_sh_select_source (int sources[], int child_count);
+
+int
+afr_sh_sink_count (int sources[], int child_count);
+
+int
+afr_sh_source_count (int sources[], int child_count);
+
+int
+afr_sh_supress_errenous_children (int sources[], int child_errno[],
+ int child_count);
+
+int
+afr_sh_supress_empty_children (int sources[], dict_t *xattr[],
+ struct stat *buf,
+ int child_count, const char *key);
+
+void
+afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this);
+
+void
+afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[],
+ int child_count, const char *key);
+
+void
+afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[],
+ int32_t success[], int child_count);
+
+int
+afr_sh_mark_sources (int32_t *pending_matrix[], int sources[],
+ int child_count);
+
+int
+afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[],
+ int child_count, const char *key);
+
+int
+afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count);
+
+
+#endif /* __AFR_SELF_HEAL_COMMON_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
new file mode 100644
index 000000000..3a48da485
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -0,0 +1,1030 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+#include "byte-order.h"
+
+#include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-self-heal-common.h"
+
+
+
+/*
+ * Last step of the data self-heal sequence: log that self-heal of the
+ * file is complete and invoke the completion callback stored in the
+ * self-heal state. Always returns 0.
+ */
+int
+afr_sh_data_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+
+        /* TODO: cleanup sh->* */
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "self heal of %s completed",
+                local->loc.path);
+
+        sh->completion_cbk (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Reply handler for the flush calls wound by afr_sh_data_close().
+ *
+ * The flush result (op_ret/op_errno) is not examined. Once
+ * afr_frame_return() yields 0 -- treated here as "this was the last
+ * outstanding reply" -- the healing fd reference is dropped and the
+ * data self-heal is completed. Always returns 0.
+ */
+int
+afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+
+        /* nothing is recorded under the frame lock */
+        LOCK (&frame->lock);
+        {
+        }
+        UNLOCK (&frame->lock);
+
+        if (afr_frame_return (frame) != 0)
+                return 0;
+
+        fd_unref (sh->healing_fd);
+        sh->healing_fd = NULL;
+        afr_sh_data_done (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Flush the healing fd on the source and on every active sink.
+ *
+ * If no healing fd was ever set up, the data self-heal is completed
+ * right away. Otherwise local->call_count is set to active_sinks + 1
+ * and one flush is wound to the source followed by one to each child
+ * that is up and not marked as a source. Replies are collected in
+ * afr_sh_data_flush_cbk(). Always returns 0.
+ */
+int
+afr_sh_data_close (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local     = frame->local;
+        afr_private_t   *priv      = this->private;
+        afr_self_heal_t *sh        = &local->self_heal;
+        int              remaining = 0;
+        int              child     = 0;
+
+        if (sh->healing_fd == NULL) {
+                afr_sh_data_done (frame, this);
+                return 0;
+        }
+
+        /* the source plus every active sink */
+        remaining = sh->active_sinks + 1;
+        local->call_count = remaining;
+
+        /* source first */
+        gf_log (this->name, GF_LOG_DEBUG,
+                "closing fd of %s on %s",
+                local->loc.path, priv->children[sh->source]->name);
+
+        STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
+                           (void *) (long) sh->source,
+                           priv->children[sh->source],
+                           priv->children[sh->source]->fops->flush,
+                           sh->healing_fd);
+        remaining--;
+
+        /* then the sinks */
+        for (child = 0; child < priv->child_count; child++) {
+                if (!sh->sources[child] && local->child_up[child]) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "closing fd of %s on %s",
+                                local->loc.path, priv->children[child]->name);
+
+                        STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
+                                           (void *) (long) child,
+                                           priv->children[child],
+                                           priv->children[child]->fops->flush,
+                                           sh->healing_fd);
+
+                        /* stop as soon as the last expected call is out */
+                        if (--remaining == 0)
+                                break;
+                }
+        }
+
+        return 0;
+}
+
+
+/*
+ * Reply handler for the inodelk unlock calls wound by
+ * afr_sh_data_unlock().
+ *
+ * cookie carries the index of the child that replied. A failed unlock
+ * is logged as an error, a successful one at debug level; neither
+ * changes the self-heal state. Once afr_frame_return() yields 0 the
+ * sequence moves on to afr_sh_data_close(). Always returns 0.
+ */
+int
+afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local       = NULL;
+        int          call_count  = 0;
+        int          child_index = (long) cookie;
+
+        local = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                /* this is the unlock path, so the messages say "unlock"
+                   (same wording as afr_sh_entry_unlck_cbk) */
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "unlocking inode of %s on child %d failed: %s",
+                                local->loc.path, child_index,
+                                strerror (op_errno));
+                } else {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "unlocked inode of %s on child %d",
+                                local->loc.path, child_index);
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                afr_sh_data_close (frame, this);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Release the whole-file inode lock on every child that is up.
+ *
+ * local->call_count is set to local->child_count and one F_UNLCK
+ * inodelk request (l_start = 0, l_len = 0) is wound per up child.
+ * Replies are collected in afr_sh_data_unlck_cbk(). Always returns 0.
+ */
+int
+afr_sh_data_unlock (call_frame_t *frame, xlator_t *this)
+{
+        /* zero-fill so that l_whence and l_pid are not passed down as
+           uninitialized stack contents */
+        struct flock    flock      = {0, };
+        int             i          = 0;
+        int             call_count = 0;
+
+        afr_local_t    *local = NULL;
+        afr_private_t  *priv  = NULL;
+
+        local = frame->local;
+        priv  = this->private;
+
+        call_count = local->child_count;
+
+        local->call_count = call_count;
+
+        flock.l_start = 0;
+        flock.l_len   = 0;
+        flock.l_type  = F_UNLCK;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (local->child_up[i]) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "unlocking %s on subvolume %s",
+                                local->loc.path, priv->children[i]->name);
+
+                        STACK_WIND_COOKIE (frame, afr_sh_data_unlck_cbk,
+                                           (void *) (long) i,
+                                           priv->children[i],
+                                           priv->children[i]->fops->inodelk,
+                                           &local->loc, F_SETLK, &flock);
+
+                        /* stop once the last expected call is out */
+                        if (!--call_count)
+                                break;
+                }
+        }
+
+        return 0;
+}
+
+
+/*
+ * Wind down the data self-heal: log it and start releasing the inode
+ * locks through afr_sh_data_unlock(). Used by both the success and the
+ * failure paths in this file. Always returns 0.
+ */
+int
+afr_sh_data_finish (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "finishing data selfheal of %s", local->loc.path);
+
+        afr_sh_data_unlock (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Reply handler for the xattrop calls wound by
+ * afr_sh_data_erase_pending().
+ *
+ * The result of the xattrop is not examined. When afr_frame_return()
+ * yields 0 the data self-heal is finished via afr_sh_data_finish().
+ * Always returns 0.
+ */
+int
+afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
+                               xlator_t *this, int32_t op_ret,
+                               int32_t op_errno, dict_t *xattr)
+{
+        int replies_left = 0;
+
+        /* nothing is recorded under the frame lock */
+        LOCK (&frame->lock);
+        {
+        }
+        UNLOCK (&frame->lock);
+
+        replies_left = afr_frame_return (frame);
+
+        if (replies_left != 0)
+                return 0;
+
+        afr_sh_data_finish (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Send the xattrop that adjusts the AFR_DATA_PENDING counters on every
+ * child for which a lookup xattr was recorded.
+ *
+ * The pending matrix is converted to a delta matrix with
+ * afr_sh_pending_to_delta(), the deltas are packed into one freshly
+ * created dict per such child with afr_sh_delta_to_xattr(), and each
+ * dict is wound as a GF_XATTROP_ADD_ARRAY xattrop. Replies are
+ * collected in afr_sh_data_erase_pending_cbk(). The temporary dicts
+ * and their array are released before returning. Always returns 0.
+ */
+int
+afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t      *local   = frame->local;
+        afr_private_t    *priv    = this->private;
+        afr_self_heal_t  *sh      = &local->self_heal;
+        dict_t          **erase   = NULL;
+        int               pending = 0;
+        int               child   = 0;
+
+        afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix,
+                                 sh->success, priv->child_count);
+
+        erase = CALLOC (sizeof (*erase), priv->child_count);
+
+        /* one dict, and one expected reply, per child with an xattr */
+        for (child = 0; child < priv->child_count; child++) {
+                if (!sh->xattr[child])
+                        continue;
+
+                pending++;
+
+                erase[child] = get_new_dict();
+                dict_ref (erase[child]);
+        }
+
+        afr_sh_delta_to_xattr (sh->delta_matrix, erase,
+                               priv->child_count, AFR_DATA_PENDING);
+
+        local->call_count = pending;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (erase[child]) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "erasing pending flags from %s on %s",
+                                local->loc.path, priv->children[child]->name);
+
+                        STACK_WIND_COOKIE (frame,
+                                           afr_sh_data_erase_pending_cbk,
+                                           (void *) (long) child,
+                                           priv->children[child],
+                                           priv->children[child]->fops->xattrop,
+                                           &local->loc,
+                                           GF_XATTROP_ADD_ARRAY, erase[child]);
+
+                        /* stop once the last expected call is out */
+                        if (--pending == 0)
+                                break;
+                }
+        }
+
+        /* drop the references taken above */
+        for (child = 0; child < priv->child_count; child++) {
+                if (!erase[child])
+                        continue;
+
+                dict_unref (erase[child]);
+        }
+        FREE (erase);
+
+        return 0;
+}
+
+
+/*
+ * Reply handler for the ftruncate calls wound by
+ * afr_sh_data_trim_sinks().
+ *
+ * cookie carries the index of the sink that replied. The outcome is
+ * only logged. When afr_frame_return() yields 0 the sequence moves on
+ * to afr_sh_data_erase_pending(). Always returns 0.
+ */
+int
+afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        afr_private_t *priv  = this->private;
+        afr_local_t   *local = frame->local;
+        int            sink  = (long) cookie;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "ftruncate of %s on subvolume %s completed",
+                                local->loc.path,
+                                priv->children[sink]->name);
+                } else {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "ftruncate of %s on subvolume %s failed (%s)",
+                                local->loc.path,
+                                priv->children[sink]->name,
+                                strerror (op_errno));
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (afr_frame_return (frame) != 0)
+                return 0;
+
+        afr_sh_data_erase_pending (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Truncate the healing fd to sh->file_size on every active sink.
+ *
+ * local->call_count is set to sh->active_sinks and one ftruncate is
+ * wound to each child that is up and not marked as a source. Replies
+ * are collected in afr_sh_data_trim_cbk(). Always returns 0.
+ */
+int
+afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t   *priv      = this->private;
+        afr_local_t     *local     = frame->local;
+        afr_self_heal_t *sh        = &local->self_heal;
+        int             *is_source = sh->sources;
+        int              remaining = sh->active_sinks;
+        int              child     = 0;
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!is_source[child] && local->child_up[child]) {
+                        STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk,
+                                           (void *) (long) child,
+                                           priv->children[child],
+                                           priv->children[child]->fops->ftruncate,
+                                           sh->healing_fd, sh->file_size);
+
+                        /* stop once the last expected call is out */
+                        if (--remaining == 0)
+                                break;
+                }
+        }
+
+        return 0;
+}
+
+
+int
+afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this);
+
+/*
+ * Reply handler for the writev calls wound to the sinks by
+ * afr_sh_data_read_cbk().
+ *
+ * cookie carries the index of the sink that replied. A failed write is
+ * logged and marks the self-heal as failed (sh->op_failed). When
+ * afr_frame_return() yields 0 the next read/write round is started
+ * through afr_sh_data_read_write_iter(). Always returns 0.
+ */
+int
+afr_sh_data_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        afr_private_t   *priv  = this->private;
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        int              sink  = (long) cookie;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "wrote %d bytes of data from %s to child %d, offset %"PRId64"",
+                op_ret, local->loc.path, sink, sh->offset - op_ret);
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "write to %s failed on subvolume %s (%s)",
+                                local->loc.path,
+                                priv->children[sink]->name,
+                                strerror (op_errno));
+                        sh->op_failed = 1;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (afr_frame_return (frame) != 0)
+                return 0;
+
+        afr_sh_data_read_write_iter (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Reply handler for the readv wound to the source by
+ * afr_sh_data_read_write().
+ *
+ * cookie carries the index of the source child.
+ *
+ *  - op_ret == -1: the read failed. The failure is logged, the
+ *    self-heal is marked failed and finished without trimming the sinks
+ *    or touching the pending flags, so a partially copied sink is not
+ *    later treated as healed.
+ *  - op_ret == 0: end of file; the sinks are trimmed to the source size.
+ *  - op_ret > 0: sh->offset is advanced and the data is written to
+ *    every active sink at the offset it was read from. When the source
+ *    was flagged as having holes and iov_0filled() returns 0 for the
+ *    block, the write is skipped and the next round starts at once.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_data_read_cbk (call_frame_t *frame, void *cookie,
+                      xlator_t *this, int32_t op_ret, int32_t op_errno,
+                      struct iovec *vector, int32_t count, struct stat *buf)
+{
+        afr_private_t   *priv  = NULL;
+        afr_local_t     *local = NULL;
+        afr_self_heal_t *sh    = NULL;
+
+        int child_index = (long) cookie;
+        int i           = 0;
+        int call_count  = 0;
+
+        off_t offset;
+
+        priv  = this->private;
+        local = frame->local;
+        sh    = &local->self_heal;
+
+        call_count = sh->active_sinks;
+
+        local->call_count = call_count;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "read %d bytes of data from %s on child %d, offset %"PRId64"",
+                op_ret, local->loc.path, child_index, sh->offset);
+
+        if (op_ret == -1) {
+                /* a read error is not EOF: abort instead of trimming the
+                   sinks and erasing the pending flags */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "read from %s failed on subvolume %s (%s)",
+                        local->loc.path,
+                        priv->children[child_index]->name,
+                        strerror (op_errno));
+                sh->op_failed = 1;
+                afr_sh_data_finish (frame, this);
+                return 0;
+        }
+
+        if (op_ret == 0) {
+                /* end of file on the source */
+                afr_sh_data_trim_sinks (frame, this);
+                return 0;
+        }
+
+        /* what if we read less than block size? */
+        offset = sh->offset;
+        sh->offset += op_ret;
+
+        frame->root->req_refs = frame->root->rsp_refs;
+
+        if (sh->file_has_holes) {
+                if (iov_0filled (vector, count) == 0) {
+                        /* the iter function depends on the
+                           sh->offset already being updated
+                           above
+                        */
+                        afr_sh_data_read_write_iter (frame, this);
+                        goto out;
+                }
+        }
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (sh->sources[i] || !local->child_up[i])
+                        continue;
+
+                /* this is a sink, so write to it */
+                STACK_WIND_COOKIE (frame, afr_sh_data_write_cbk,
+                                   (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->writev,
+                                   sh->healing_fd, vector, count, offset);
+
+                /* stop once the last expected call is out */
+                if (!--call_count)
+                        break;
+        }
+
+out:
+        return 0;
+}
+
+
+/*
+ * Wind one readv of sh->block_size bytes at sh->offset to the source
+ * child, using the healing fd. The reply is handled by
+ * afr_sh_data_read_cbk(), which receives the source index as cookie.
+ * Always returns 0.
+ */
+int
+afr_sh_data_read_write (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t   *priv  = this->private;
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+
+        STACK_WIND_COOKIE (frame, afr_sh_data_read_cbk,
+                           (void *) (long) sh->source,
+                           priv->children[sh->source],
+                           priv->children[sh->source]->fops->readv,
+                           sh->healing_fd, sh->block_size,
+                           sh->offset);
+
+        return 0;
+}
+
+
+/*
+ * Decide what follows one read/write round:
+ *
+ *  - if the self-heal has been marked failed, finish it;
+ *  - if sh->offset has reached sh->file_size, trim the sinks;
+ *  - otherwise start another read/write round.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+
+        if (sh->op_failed) {
+                afr_sh_data_finish (frame, this);
+                return 0;
+        }
+
+        if (sh->offset < sh->file_size) {
+                /* more data left to copy */
+                afr_sh_data_read_write (frame, this);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "closing fd's of %s",
+                local->loc.path);
+        afr_sh_data_trim_sinks (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Reply handler for the open calls wound by afr_sh_data_open().
+ *
+ * cookie carries the index of the child that replied. A failed open is
+ * logged and marks the self-heal as failed. After afr_frame_return()
+ * yields 0, a failed self-heal is finished; otherwise the first
+ * read/write round is started. Always returns 0.
+ */
+int
+afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, fd_t *fd)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        afr_private_t   *priv  = this->private;
+        int              child = (long) cookie;
+
+        /* TODO: some of the open's might fail.
+           In that case, modify cleanup fn to send flush on those
+           fd's which are already open */
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "open of %s failed on child %s (%s)",
+                                local->loc.path,
+                                priv->children[child]->name,
+                                strerror (op_errno));
+                        sh->op_failed = 1;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (afr_frame_return (frame) != 0)
+                return 0;
+
+        if (sh->op_failed) {
+                afr_sh_data_finish (frame, this);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "fd for %s opened, commencing sync",
+                local->loc.path);
+
+        gf_log (this->name, GF_LOG_WARNING,
+                "sourcing file %s from %s to other sinks",
+                local->loc.path, priv->children[sh->source]->name);
+
+        afr_sh_data_read_write (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Create the healing fd and open the file on the source (O_RDONLY) and
+ * on every active sink (O_WRONLY).
+ *
+ * Also initialises the copy parameters: a block size of 65536 bytes,
+ * the file size taken from the source's stat, and the "has holes" flag
+ * when FILE_HAS_HOLES() is true for that stat.
+ *
+ * If the fd cannot be created the self-heal is marked failed and
+ * finished instead of winding opens with a NULL fd. Otherwise
+ * local->call_count is set to active_sinks + 1 and the replies are
+ * collected in afr_sh_data_open_cbk(). Always returns 0.
+ */
+int
+afr_sh_data_open (call_frame_t *frame, xlator_t *this)
+{
+        int i          = 0;
+        int call_count = 0;
+
+        int  source  = -1;
+        int *sources = NULL;
+
+        fd_t *fd = NULL;
+
+        afr_local_t     *local = NULL;
+        afr_private_t   *priv  = NULL;
+        afr_self_heal_t *sh    = NULL;
+
+        local = frame->local;
+        sh    = &local->self_heal;
+        priv  = this->private;
+
+        fd = fd_create (local->loc.inode, frame->root->pid);
+        sh->healing_fd = fd;
+
+        if (!fd) {
+                /* healing_fd stays NULL, so afr_sh_data_close() will
+                   skip the flush stage */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not create fd for %s",
+                        local->loc.path);
+                sh->op_failed = 1;
+                afr_sh_data_finish (frame, this);
+                return 0;
+        }
+
+        /* the source plus every active sink */
+        call_count = sh->active_sinks + 1;
+        local->call_count = call_count;
+
+        source  = local->self_heal.source;
+        sources = local->self_heal.sources;
+
+        sh->block_size = 65536;
+        sh->file_size  = sh->buf[source].st_size;
+
+        if (FILE_HAS_HOLES (&sh->buf[source]))
+                sh->file_has_holes = 1;
+
+        /* open source */
+        STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk,
+                           (void *) (long) source,
+                           priv->children[source],
+                           priv->children[source]->fops->open,
+                           &local->loc, O_RDONLY|O_LARGEFILE, fd);
+        call_count--;
+
+        /* open sinks */
+        for (i = 0; i < priv->child_count; i++) {
+                if (sources[i] || !local->child_up[i])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk,
+                                   (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->open,
+                                   &local->loc,
+                                   O_WRONLY|O_LARGEFILE, fd);
+
+                /* stop once the last expected call is out */
+                if (!--call_count)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Count the active sinks (children that are up and not marked as a
+ * source) and flag them, together with the chosen source, in
+ * sh->success.
+ *
+ * With no active sink the data self-heal is finished right away;
+ * otherwise the count is stored in sh->active_sinks and the files are
+ * opened through afr_sh_data_open(). Always returns 0.
+ */
+int
+afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local  = frame->local;
+        afr_self_heal_t *sh     = &local->self_heal;
+        afr_private_t   *priv   = this->private;
+        int              source = sh->source;
+        int              nsinks = 0;
+        int              child  = 0;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (sh->sources[child] != 0)
+                        continue;
+                if (local->child_up[child] != 1)
+                        continue;
+
+                nsinks++;
+                sh->success[child] = 1;
+        }
+        sh->success[source] = 1;
+
+        if (nsinks == 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no active sinks for performing self-heal on file %s",
+                        local->loc.path);
+                afr_sh_data_finish (frame, this);
+                return 0;
+        }
+        sh->active_sinks = nsinks;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "syncing data of %s from subvolume %s to %d active sinks",
+                local->loc.path, priv->children[source]->name, nsinks);
+
+        afr_sh_data_open (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Decide which child is the source for the data self-heal and hand over
+ * to afr_sh_data_sync_prepare().
+ *
+ * The pending matrix is built from the xattrs collected by the lookup
+ * stage (key AFR_DATA_PENDING), sources are marked from it, and the two
+ * "supress" helpers then get a chance to adjust sh->sources. If no
+ * source remains, the configured favorite child is used when there is
+ * one and it reported no error; if there is still no source, the
+ * conflict is logged for manual resolution and the self-heal is
+ * finished. Always returns 0.
+ */
+int
+afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ int nsources = 0;
+ int source = 0;
+ int i = 0;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr,
+ priv->child_count, AFR_DATA_PENDING);
+
+ afr_sh_print_pending_matrix (sh->pending_matrix, this);
+
+
+ afr_sh_mark_sources (sh->pending_matrix, sh->sources,
+ priv->child_count);
+
+ afr_sh_supress_empty_children (sh->sources, sh->xattr, sh->buf,
+ priv->child_count, AFR_DATA_PENDING);
+
+ afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
+ priv->child_count);
+
+ nsources = afr_sh_source_count (sh->sources, priv->child_count);
+
+ /* fall back to the favorite child only when nothing else qualified,
+ a favorite is configured (!= -1) and it reported no error */
+ if ((nsources == 0)
+ && (priv->favorite_child != -1)
+ && (sh->child_errno[priv->favorite_child] == 0)) {
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "Picking favorite child %s as authentic source to resolve conflicting data of %s",
+ priv->children[priv->favorite_child]->name,
+ local->loc.path);
+
+ sh->sources[priv->favorite_child] = 1;
+
+ nsources = afr_sh_source_count (sh->sources,
+ priv->child_count);
+ }
+
+ /* still no source: give up and leave the conflict to the admin */
+ if (nsources == 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to resolve conflicting data of %s. "
+ "Please resolve manually by deleting the file %s "
+ "from all but the preferred subvolume. "
+ "Please consider 'option favorite-child <>'",
+ local->loc.path, local->loc.path);
+
+ /* NOTE(review): the meaning of this flag is not visible here;
+ presumably it tells the caller the conflict is unresolved --
+ confirm where it is read */
+ local->govinda_gOvinda = 1;
+
+ afr_sh_data_finish (frame, this);
+ return 0;
+ }
+
+ source = afr_sh_select_source (sh->sources, priv->child_count);
+ sh->source = source;
+
+ /* detect changes not visible through pending flags -- JIC */
+ /* a child that reported no error but for which SIZE_DIFFERS (defined
+ elsewhere) is true against the chosen source is no longer marked as a
+ source */
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || sh->child_errno[i])
+ continue;
+
+ if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source]))
+ sh->sources[i] = 0;
+ }
+
+ afr_sh_data_sync_prepare (frame, this);
+
+ return 0;
+}
+
+
+/*
+ * Reply handler for the lookups wound by afr_sh_data_lookup().
+ *
+ * cookie carries the index of the child that replied. On success a
+ * reference to the returned xattr dict and a copy of the stat are
+ * stored in that child's slot of the self-heal state; failures leave
+ * the slot untouched. When afr_frame_return() yields 0 the source is
+ * chosen in afr_sh_data_fix(). Always returns 0.
+ */
+int
+afr_sh_data_lookup_cbk (call_frame_t *frame, void *cookie,
+                        xlator_t *this, int32_t op_ret, int32_t op_errno,
+                        inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        int              child = (long) cookie;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        sh->xattr[child] = dict_ref (xattr);
+                        sh->buf[child]   = *buf;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (afr_frame_return (frame) != 0)
+                return 0;
+
+        afr_sh_data_fix (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Look the file up on every child that is up, asking for the
+ * AFR_DATA_PENDING xattr.
+ *
+ * The request dict carries AFR_DATA_PENDING with the value
+ * child_count * sizeof (int32_t); the result of setting it is not
+ * checked, and the lookups are wound even if the dict could not be
+ * created. local->call_count is set to local->child_count. Replies are
+ * collected in afr_sh_data_lookup_cbk(). Always returns 0.
+ */
+int
+afr_sh_data_lookup (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t *priv      = this->private;
+        afr_local_t   *local     = frame->local;
+        dict_t        *request   = NULL;
+        int            remaining = 0;
+        int            child     = 0;
+
+        remaining = local->child_count;
+        local->call_count = remaining;
+
+        request = dict_new();
+        if (request != NULL) {
+                (void) dict_set_uint64 (request, AFR_DATA_PENDING,
+                                        priv->child_count * sizeof(int32_t));
+        }
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_sh_data_lookup_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->lookup,
+                                   &local->loc, request);
+
+                /* stop once the last expected call is out */
+                if (--remaining == 0)
+                        break;
+        }
+
+        if (request != NULL)
+                dict_unref (request);
+
+        return 0;
+}
+
+
+/*
+ * Reply handler for the inodelk calls wound by afr_sh_data_lock().
+ *
+ * cookie carries the index of the child that replied. A failed lock
+ * marks the self-heal as failed and is logged (debug level for EAGAIN,
+ * error level otherwise). After afr_frame_return() yields 0, a failed
+ * self-heal is finished; otherwise the lookup stage starts.
+ * Always returns 0.
+ */
+int
+afr_sh_data_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        int              child = (long) cookie;
+
+        /* TODO: what if lock fails? */
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "inode of %s on child %d locked",
+                                local->loc.path, child);
+                } else {
+                        sh->op_failed = 1;
+
+                        gf_log (this->name,
+                                (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
+                                "locking of %s on child %d failed: %s",
+                                local->loc.path, child,
+                                strerror (op_errno));
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (afr_frame_return (frame) != 0)
+                return 0;
+
+        if (sh->op_failed) {
+                afr_sh_data_finish (frame, this);
+                return 0;
+        }
+
+        afr_sh_data_lookup (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Take a whole-file write lock on the inode on every child that is up.
+ *
+ * local->call_count is set to local->child_count and one F_WRLCK
+ * inodelk request (l_start = 0, l_len = 0, command F_SETLK) is wound
+ * per up child. Replies are collected in afr_sh_data_lock_cbk().
+ * Always returns 0.
+ */
+int
+afr_sh_data_lock (call_frame_t *frame, xlator_t *this)
+{
+        /* zero-fill so that l_whence and l_pid are not passed down as
+           uninitialized stack contents */
+        struct flock    flock      = {0, };
+        int             i          = 0;
+        int             call_count = 0;
+
+        afr_local_t    *local = NULL;
+        afr_private_t  *priv  = NULL;
+
+        local = frame->local;
+        priv  = this->private;
+
+        call_count = local->child_count;
+
+        local->call_count = call_count;
+
+        flock.l_start = 0;
+        flock.l_len   = 0;
+        flock.l_type  = F_WRLCK;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (local->child_up[i]) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "locking %s on subvolume %s",
+                                local->loc.path, priv->children[i]->name);
+
+                        STACK_WIND_COOKIE (frame, afr_sh_data_lock_cbk,
+                                           (void *) (long) i,
+                                           priv->children[i],
+                                           priv->children[i]->fops->inodelk,
+                                           &local->loc, F_SETLK, &flock);
+
+                        /* stop once the last expected call is out */
+                        if (!--call_count)
+                                break;
+                }
+        }
+
+        return 0;
+}
+
+
+/*
+ * Entry point of the data self-heal stage.
+ *
+ * The heal runs only when the file was flagged as needing data
+ * self-heal and priv->data_self_heal is set; it then starts by taking
+ * the inode locks. Otherwise the stage is completed immediately.
+ * Always returns 0.
+ */
+int
+afr_self_heal_data (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t *priv  = this->private;
+        afr_local_t   *local = frame->local;
+
+        if (!local->need_data_self_heal || !priv->data_self_heal) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "not doing data self heal on %s",
+                        local->loc.path);
+                afr_sh_data_done (frame, this);
+                return 0;
+        }
+
+        afr_sh_data_lock (frame, this);
+
+        return 0;
+}
+
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
new file mode 100644
index 000000000..ec341922e
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -0,0 +1,2038 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+#include "byte-order.h"
+
+#include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-self-heal-common.h"
+
+
+
+/*
+ * Last step of the entry self-heal sequence: log that self-heal of the
+ * path is complete and invoke the completion callback stored in the
+ * self-heal state. Always returns 0.
+ */
+int
+afr_sh_entry_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+
+        /* TODO: cleanup sh->* */
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "self heal of %s completed",
+                local->loc.path);
+
+        sh->completion_cbk (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Reply handler for the entrylk unlock calls wound by
+ * afr_sh_entry_unlock().
+ *
+ * cookie carries the index of the child that replied. The outcome is
+ * only logged. When afr_frame_return() yields 0, any healing fd is
+ * released and the entry self-heal is completed. Always returns 0.
+ */
+int
+afr_sh_entry_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        int              child = (long) cookie;
+
+        /* TODO: what if lock fails? */
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "unlocked inode of %s on child %d",
+                                local->loc.path, child);
+                } else {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "unlocking inode of %s on child %d failed: %s",
+                                local->loc.path, child,
+                                strerror (op_errno));
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (afr_frame_return (frame) != 0)
+                return 0;
+
+        if (sh->healing_fd != NULL)
+                fd_unref (sh->healing_fd);
+        sh->healing_fd = NULL;
+
+        afr_sh_entry_done (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Release the entry lock on local->loc on every child that is up.
+ *
+ * local->call_count is set to local->child_count and one entrylk
+ * request (NULL basename, ENTRYLK_UNLOCK, ENTRYLK_WRLCK) is wound per
+ * up child. Replies are collected in afr_sh_entry_unlck_cbk().
+ * Always returns 0.
+ */
+int
+afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = local->child_count;
+        int            child     = 0;
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unlocking %s on subvolume %s",
+                        local->loc.path, priv->children[child]->name);
+
+                STACK_WIND_COOKIE (frame, afr_sh_entry_unlck_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->entrylk,
+                                   &local->loc, NULL,
+                                   ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+
+                /* stop once the last expected call is out */
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Wind down the entry self-heal: log it and start releasing the entry
+ * locks through afr_sh_entry_unlock(). Always returns 0.
+ */
+int
+afr_sh_entry_finish (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "finishing entry selfheal of %s", local->loc.path);
+
+        afr_sh_entry_unlock (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Reply handler for the xattrop calls wound by
+ * afr_sh_entry_erase_pending().
+ *
+ * The result of the xattrop is not examined. When afr_frame_return()
+ * yields 0 the entry self-heal is finished via afr_sh_entry_finish().
+ * Always returns 0.
+ */
+int
+afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie,
+                                xlator_t *this, int32_t op_ret,
+                                int32_t op_errno, dict_t *xattr)
+{
+        int replies_left = 0;
+
+        /* nothing is recorded under the frame lock */
+        LOCK (&frame->lock);
+        {
+        }
+        UNLOCK (&frame->lock);
+
+        replies_left = afr_frame_return (frame);
+
+        if (replies_left != 0)
+                return 0;
+
+        afr_sh_entry_finish (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Send the xattrop that adjusts the AFR_ENTRY_PENDING counters on every
+ * child for which a lookup xattr was recorded.
+ *
+ * The pending matrix is converted to a delta matrix with
+ * afr_sh_pending_to_delta(), the deltas are packed into one freshly
+ * created dict per such child with afr_sh_delta_to_xattr(), and each
+ * dict is wound as a GF_XATTROP_ADD_ARRAY xattrop. Replies are
+ * collected in afr_sh_entry_erase_pending_cbk(). The temporary dicts
+ * and their array are released before returning. Always returns 0.
+ */
+int
+afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t      *local   = frame->local;
+        afr_private_t    *priv    = this->private;
+        afr_self_heal_t  *sh      = &local->self_heal;
+        dict_t          **erase   = NULL;
+        int               pending = 0;
+        int               child   = 0;
+
+        afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix,
+                                 sh->success, priv->child_count);
+
+        erase = CALLOC (sizeof (*erase), priv->child_count);
+
+        /* one dict, and one expected reply, per child with an xattr */
+        for (child = 0; child < priv->child_count; child++) {
+                if (!sh->xattr[child])
+                        continue;
+
+                pending++;
+
+                erase[child] = get_new_dict();
+                dict_ref (erase[child]);
+        }
+
+        afr_sh_delta_to_xattr (sh->delta_matrix, erase,
+                               priv->child_count, AFR_ENTRY_PENDING);
+
+        local->call_count = pending;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (erase[child]) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "erasing pending flags from %s on %s",
+                                local->loc.path, priv->children[child]->name);
+
+                        STACK_WIND_COOKIE (frame,
+                                           afr_sh_entry_erase_pending_cbk,
+                                           (void *) (long) child,
+                                           priv->children[child],
+                                           priv->children[child]->fops->xattrop,
+                                           &local->loc,
+                                           GF_XATTROP_ADD_ARRAY, erase[child]);
+
+                        /* stop once the last expected call is out */
+                        if (--pending == 0)
+                                break;
+                }
+        }
+
+        /* drop the references taken above */
+        for (child = 0; child < priv->child_count; child++) {
+                if (!erase[child])
+                        continue;
+
+                dict_unref (erase[child]);
+        }
+        FREE (erase);
+
+        return 0;
+}
+
+
+
+/*
+ * Return the index of the child to use as the next source after
+ * current_active_source, or -1 when there is none.
+ *
+ * When a definite source is known (sh->source != -1) it is returned
+ * once: the result is that source unless it already is the current
+ * one. Without a definite source, the first child above the current
+ * index that is up and not marked as a source is returned; the
+ * original code describes this as each active sink taking its turn as
+ * source for the "conservative decision" of merging all entries.
+ */
+static int
+next_active_source (call_frame_t *frame, xlator_t *this,
+                    int current_active_source)
+{
+        afr_private_t   *priv  = this->private;
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        int              child = 0;
+
+        if (sh->source != -1) {
+                return (current_active_source != sh->source)
+                        ? sh->source : -1;
+        }
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (sh->sources[child] != 0)
+                        continue;
+                if (local->child_up[child] != 1)
+                        continue;
+                if (child <= current_active_source)
+                        continue;
+
+                return child;
+        }
+
+        return -1;
+}
+
+
+
+/*
+ * Return the index of the first child above current_active_sink that
+ * is up and not marked as a source, or -1 when there is none.
+ */
+static int
+next_active_sink (call_frame_t *frame, xlator_t *this,
+                  int current_active_sink)
+{
+        afr_private_t   *priv  = this->private;
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        int              child = 0;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (sh->sources[child] != 0)
+                        continue;
+                if (local->child_up[child] != 1)
+                        continue;
+                if (child <= current_active_sink)
+                        continue;
+
+                return child;
+        }
+
+        return -1;
+}
+
+
+/*
+ * Fill in `child` as the loc of entry `name` inside directory `parent`.
+ *
+ * child->path becomes a newly allocated "<parent path>/<name>" (or
+ * "/<name>" when the parent is "/"), child->name points at the last
+ * component inside that path, child->parent takes a reference on the
+ * parent's inode and child->inode is a new inode from the parent's
+ * inode table.
+ *
+ * Returns 0 on success. Returns -1 when `child` is NULL or when an
+ * allocation fails; in the latter case `child` is wiped with
+ * loc_wipe() before returning.
+ */
+int
+build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name)
+{
+        int ret = -1;
+        int len = -1;
+
+        if (!child) {
+                /* nothing was built, and loc_wipe() must not be handed
+                   a NULL loc, so skip the common exit path */
+                return -1;
+        }
+
+        if (strcmp (parent->path, "/") == 0)
+                len = asprintf ((char **)&child->path, "/%s", name);
+        else
+                len = asprintf ((char **)&child->path, "%s/%s",
+                                parent->path, name);
+
+        if (len == -1) {
+                /* the target of asprintf() is unspecified after a
+                   failure; make sure loc_wipe() does not see it */
+                child->path = NULL;
+        }
+
+        if (!child->path) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                goto out;
+        }
+
+        child->name = strrchr (child->path, '/');
+        if (child->name)
+                child->name++;
+
+        child->parent = inode_ref (parent->inode);
+        child->inode = inode_new (parent->inode->table);
+
+        if (!child->inode) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                goto out;
+        }
+
+        ret = 0;
+out:
+        if (ret == -1)
+                loc_wipe (child);
+
+        return ret;
+}
+
+
+int
+afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this);
+
+int
+afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
+ int active_src);
+
+/*
+ * Called on the self-heal frame each time the expunge of one entry has
+ * finished. When afr_frame_return() yields 0 the expunge of
+ * subvolume active_src is continued through
+ * afr_sh_entry_expunge_subvol(). Always returns 0.
+ */
+int
+afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this,
+                                 int active_src)
+{
+        int entries_left = 0;
+
+        /* nothing is recorded under the frame lock */
+        LOCK (&frame->lock);
+        {
+        }
+        UNLOCK (&frame->lock);
+
+        entries_left = afr_frame_return (frame);
+
+        if (entries_left != 0)
+                return 0;
+
+        afr_sh_entry_expunge_subvol (frame, this, active_src);
+
+        return 0;
+}
+
+
+/*
+ * Callback shared by the rmdir and unlink calls issued while expunging
+ * an entry.  @cookie carries the index of the subvolume the removal was
+ * sent to.  The result is only logged; in both cases the per-entry
+ * frame is destroyed and the parent self-heal frame is notified.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie,
+                                 xlator_t *this,
+                                 int32_t op_ret, int32_t op_errno)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *expunge_local = expunge_frame->local;
+        /* fetch the parent frame before the per-entry frame goes away */
+        call_frame_t  *frame         = expunge_local->self_heal.sh_frame;
+        int            active_src    = (long) cookie;
+
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "removing %s on %s failed (%s)",
+                        expunge_local->loc.path,
+                        priv->children[active_src]->name,
+                        strerror (op_errno));
+        } else {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "removed %s on %s",
+                        expunge_local->loc.path,
+                        priv->children[active_src]->name);
+        }
+
+        AFR_STACK_DESTROY (expunge_frame);
+        afr_sh_entry_expunge_entry_done (frame, this, active_src);
+
+        return 0;
+}
+
+
+/*
+ * Wind an rmdir for the entry held in the per-entry frame's loc to
+ * subvolume @active_src.  The reply is handled by
+ * afr_sh_entry_expunge_remove_cbk, which receives @active_src as cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this,
+                            int active_src)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *expunge_local = expunge_frame->local;
+        xlator_t      *subvol        = priv->children[active_src];
+
+        gf_log (this->name, GF_LOG_WARNING,
+                "removing directory %s on %s",
+                expunge_local->loc.path, subvol->name);
+
+        STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
+                           (void *) (long) active_src,
+                           subvol,
+                           subvol->fops->rmdir,
+                           &expunge_local->loc);
+
+        return 0;
+}
+
+
+/*
+ * Wind an unlink for the entry held in the per-entry frame's loc to
+ * subvolume @active_src.  The reply is handled by
+ * afr_sh_entry_expunge_remove_cbk, which receives @active_src as cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this,
+                             int active_src)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *expunge_local = expunge_frame->local;
+        xlator_t      *subvol        = priv->children[active_src];
+
+        gf_log (this->name, GF_LOG_WARNING,
+                "unlinking file %s on %s",
+                expunge_local->loc.path, subvol->name);
+
+        STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
+                           (void *) (long) active_src,
+                           subvol,
+                           subvol->fops->unlink,
+                           &expunge_local->loc);
+
+        return 0;
+}
+
+
+/*
+ * Remove the entry held in the per-entry frame's loc from subvolume
+ * @active_src, choosing rmdir or unlink from the file type in @buf
+ * (the stat returned by the lookup on @active_src).
+ *
+ * For an unrecognised file type nothing is removed: the per-entry frame
+ * is destroyed and the parent self-heal frame is told the entry is done.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this,
+                             int active_src, struct stat *buf)
+{
+        afr_private_t   *priv          = NULL;
+        afr_local_t     *expunge_local = NULL;
+        afr_self_heal_t *expunge_sh    = NULL;
+        call_frame_t    *frame         = NULL;
+        int              type          = 0;
+
+        priv          = this->private;
+        expunge_local = expunge_frame->local;
+        expunge_sh    = &expunge_local->self_heal;
+        frame         = expunge_sh->sh_frame;
+
+        type = (buf->st_mode & S_IFMT);
+
+        switch (type) {
+        case S_IFSOCK:
+        case S_IFREG:
+        case S_IFBLK:
+        case S_IFCHR:
+        case S_IFIFO:
+        case S_IFLNK:
+                afr_sh_entry_expunge_unlink (expunge_frame, this, active_src);
+                return 0;
+        case S_IFDIR:
+                afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src);
+                return 0;
+        default:
+                break;
+        }
+
+        /* @buf was obtained from @active_src, so that is the subvolume
+           to name here.  The per-entry self-heal state's 'source' field
+           is not filled in by afr_sh_entry_expunge_entry and would name
+           an unrelated child. */
+        gf_log (this->name, GF_LOG_ERROR,
+                "%s has unknown file type on %s: 0%o",
+                expunge_local->loc.path,
+                priv->children[active_src]->name, type);
+
+        AFR_STACK_DESTROY (expunge_frame);
+        afr_sh_entry_expunge_entry_done (frame, this, active_src);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the lookup wound by afr_sh_entry_expunge_purge.  @cookie
+ * is the index of the subvolume that was looked up.  On success the
+ * returned stat is handed to afr_sh_entry_expunge_remove; on failure the
+ * error is logged, the per-entry frame is destroyed and the parent
+ * self-heal frame is notified.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie,
+                                 xlator_t *this,
+                                 int32_t op_ret, int32_t op_errno,
+                                 inode_t *inode, struct stat *buf, dict_t *x)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *expunge_local = expunge_frame->local;
+        call_frame_t  *frame         = expunge_local->self_heal.sh_frame;
+        int            active_src    = (long) cookie;
+
+        if (op_ret != -1) {
+                afr_sh_entry_expunge_remove (expunge_frame, this,
+                                             active_src, buf);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "lookup of %s on %s failed (%s)",
+                expunge_local->loc.path,
+                priv->children[active_src]->name,
+                strerror (op_errno));
+
+        AFR_STACK_DESTROY (expunge_frame);
+        afr_sh_entry_expunge_entry_done (frame, this, active_src);
+
+        return 0;
+}
+
+
+/*
+ * First step of removing an entry from @active_src: look the entry up
+ * on that subvolume.  The reply goes to afr_sh_entry_expunge_lookup_cbk
+ * with @active_src as cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this,
+                            int active_src)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *expunge_local = expunge_frame->local;
+        xlator_t      *subvol        = priv->children[active_src];
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "looking up %s on %s",
+                expunge_local->loc.path, subvol->name);
+
+        STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk,
+                           (void *) (long) active_src,
+                           subvol,
+                           subvol->fops->lookup,
+                           &expunge_local->loc, 0);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the lookup of an entry on the source subvolume (index in
+ * @cookie), issued by afr_sh_entry_expunge_entry.
+ *
+ *  - ENOENT: the entry is missing on the source, so it is purged from
+ *    the active subvolume recorded in the per-entry self-heal state.
+ *  - success or any other error: only logged; the per-entry frame is
+ *    destroyed and the parent self-heal frame is notified.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie,
+                                xlator_t *this,
+                                int32_t op_ret, int32_t op_errno,
+                                inode_t *inode, struct stat *buf, dict_t *x)
+{
+        afr_private_t   *priv          = this->private;
+        afr_local_t     *expunge_local = expunge_frame->local;
+        afr_self_heal_t *expunge_sh    = &expunge_local->self_heal;
+        call_frame_t    *frame         = expunge_sh->sh_frame;
+        int              active_src    = expunge_sh->active_source;
+        int              source        = (long) cookie;
+
+        if (op_ret == 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "%s exists under %s",
+                        expunge_local->loc.path,
+                        priv->children[source]->name);
+        } else if (op_ret == -1 && op_errno == ENOENT) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "missing entry %s on %s",
+                        expunge_local->loc.path,
+                        priv->children[source]->name);
+
+                /* the per-entry frame lives on for the purge */
+                afr_sh_entry_expunge_purge (expunge_frame, this, active_src);
+
+                return 0;
+        } else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "looking up %s under %s failed (%s)",
+                        expunge_local->loc.path,
+                        priv->children[source]->name,
+                        strerror (op_errno));
+        }
+
+        AFR_STACK_DESTROY (expunge_frame);
+        afr_sh_entry_expunge_entry_done (frame, this, active_src);
+
+        return 0;
+}
+
+
+/*
+ * Start the expunge inspection of one directory entry @name read from
+ * the active subvolume (sh->active_source): a private frame is created
+ * for the entry and a lookup of it is wound to the source subvolume
+ * (sh->source); afr_sh_entry_expunge_entry_cbk decides what happens next.
+ *
+ * "." and ".." are skipped.  Whenever no lookup is wound (skip or any
+ * failure) afr_sh_entry_expunge_entry_done is called right away so the
+ * parent frame's pending-call count stays balanced, and a per-entry
+ * frame that was already created is released instead of being leaked.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this,
+                            char *name)
+{
+        afr_private_t   *priv          = NULL;
+        afr_local_t     *local         = NULL;
+        afr_self_heal_t *sh            = NULL;
+        int              ret           = -1;
+        call_frame_t    *expunge_frame = NULL;
+        afr_local_t     *expunge_local = NULL;
+        afr_self_heal_t *expunge_sh    = NULL;
+        int              active_src    = 0;
+        int              source        = 0;
+        /* not read here; presumably assigned inside ALLOC_OR_GOTO, so it
+           is kept exactly as in the original -- confirm against afr.h */
+        int              op_errno      = 0;
+
+        priv  = this->private;
+        local = frame->local;
+        sh    = &local->self_heal;
+
+        active_src = sh->active_source;
+        source     = sh->source;
+
+        if ((strcmp (name, ".") == 0)
+            || (strcmp (name, "..") == 0)) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "skipping inspection of %s under %s",
+                        name, local->loc.path);
+                goto out;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "inspecting existance of %s under %s",
+                name, local->loc.path);
+
+        expunge_frame = copy_frame (frame);
+        if (!expunge_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (expunge_local, afr_local_t, out);
+
+        expunge_frame->local      = expunge_local;
+        expunge_sh                = &expunge_local->self_heal;
+        expunge_sh->sh_frame      = frame;
+        expunge_sh->active_source = active_src;
+
+        ret = build_child_loc (this, &expunge_local->loc, &local->loc, name);
+        if (ret != 0) {
+                goto out;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "looking up %s on %s", expunge_local->loc.path,
+                priv->children[source]->name);
+
+        STACK_WIND_COOKIE (expunge_frame,
+                           afr_sh_entry_expunge_entry_cbk,
+                           (void *) (long) source,
+                           priv->children[source],
+                           priv->children[source]->fops->lookup,
+                           &expunge_local->loc, 0);
+
+        ret = 0;
+out:
+        if (ret == -1) {
+                /* nothing was wound on the per-entry frame, so no
+                   callback will ever release it: do it here */
+                if (expunge_frame) {
+                        if (expunge_frame->local) {
+                                AFR_STACK_DESTROY (expunge_frame);
+                        } else {
+                                STACK_DESTROY (expunge_frame->root);
+                        }
+                }
+
+                afr_sh_entry_expunge_entry_done (frame, this, active_src);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Callback for the readdir wound by afr_sh_entry_expunge_subvol.
+ *
+ * op_ret <= 0 (error, or no more entries) ends the scan of the current
+ * active subvolume: the outcome is logged and afr_sh_entry_expunge_all
+ * is called to move on.
+ *
+ * Otherwise the batch is walked twice: first to count the entries and
+ * remember the d_off of the last one (stored in sh->offset for the next
+ * readdir), then to start afr_sh_entry_expunge_entry for each entry.
+ * local->call_count is set to the entry count before the second walk;
+ * every entry eventually reports back through
+ * afr_sh_entry_expunge_entry_done.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ gf_dirent_t *entries)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ gf_dirent_t *entry = NULL;
+ off_t last_offset = 0;
+ int active_src = 0;
+ int entry_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ active_src = sh->active_source;
+
+ if (op_ret <= 0) {
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "readdir of %s on subvolume %s failed (%s)",
+ local->loc.path,
+ priv->children[active_src]->name,
+ strerror (op_errno));
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "readdir of %s on subvolume %s complete",
+ local->loc.path,
+ priv->children[active_src]->name);
+ }
+
+ afr_sh_entry_expunge_all (frame, this);
+ return 0;
+ }
+
+ /* first pass: count the batch and find where the next readdir resumes */
+ list_for_each_entry (entry, &entries->list, list) {
+ last_offset = entry->d_off;
+ entry_count++;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "readdir'ed %d entries from %s",
+ entry_count, priv->children[active_src]->name);
+
+ /* both must be in place before the first entry is dispatched below */
+ sh->offset = last_offset;
+ local->call_count = entry_count;
+
+ /* second pass: one inspection per entry */
+ list_for_each_entry (entry, &entries->list, list) {
+ afr_sh_entry_expunge_entry (frame, this, entry->d_name);
+ }
+
+ return 0;
+}
+
+/*
+ * Request the next batch of directory entries from subvolume
+ * @active_src, reading sh->block_size bytes starting at sh->offset on
+ * the healing fd.  The reply goes to afr_sh_entry_expunge_readdir_cbk.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
+                             int active_src)
+{
+        afr_private_t   *priv  = this->private;
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        xlator_t        *subvol = priv->children[active_src];
+
+        STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk,
+                    subvol,
+                    subvol->fops->readdir,
+                    sh->healing_fd, sh->block_size, sh->offset);
+
+        return 0;
+}
+
+
+/*
+ * Drive the expunge phase across subvolumes.  Each call resets the
+ * readdir offset and advances sh->active_source to the next active sink
+ * (see next_active_sink).  Expunging of that subvolume is then started,
+ * unless
+ *   - there is no source at all (sh->source == -1; in that case
+ *     sh->active_source is left untouched),
+ *   - an earlier step has set sh->op_failed, or
+ *   - no further sink is left,
+ * in which case control passes to afr_sh_entry_erase_pending.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t   *priv       = this->private;
+        afr_local_t     *local      = frame->local;
+        afr_self_heal_t *sh         = &local->self_heal;
+        int              active_src = -1;
+
+        sh->offset = 0;
+
+        if (sh->source == -1) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no active sources for %s to expunge entries",
+                        local->loc.path);
+                afr_sh_entry_erase_pending (frame, this);
+                return 0;
+        }
+
+        active_src = next_active_sink (frame, this, sh->active_source);
+        sh->active_source = active_src;
+
+        /* either a previous step failed, or every sink has been
+           expunged already */
+        if (sh->op_failed || active_src == -1) {
+                afr_sh_entry_erase_pending (frame, this);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "expunging entries of %s on %s to other sinks",
+                local->loc.path, priv->children[active_src]->name);
+
+        afr_sh_entry_expunge_subvol (frame, this, active_src);
+
+        return 0;
+}
+
+
+/*
+ * Forward declarations: both functions are defined further down in this
+ * file but are called by the impunge callbacks that precede them.
+ */
+int
+afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this);
+
+int
+afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this,
+ int active_src);
+
+/*
+ * Called once for every directory entry whose impunge processing has
+ * finished (successfully or not).  Drops one pending call on @frame via
+ * afr_frame_return (); when that was the last one, the next readdir
+ * batch is requested from @active_src.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this,
+                                 int active_src)
+{
+        /* the critical section is empty in the original as well; it is
+           kept so locking behaviour stays exactly the same */
+        LOCK (&frame->lock);
+        {
+        }
+        UNLOCK (&frame->lock);
+
+        if (afr_frame_return (frame) != 0)
+                return 0;
+
+        afr_sh_entry_impunge_subvol (frame, this, active_src);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the utimens wound by afr_sh_entry_impunge_chown_cbk; the
+ * last step of recreating an entry on sink @cookie.  The result is only
+ * logged.  One pending call is then dropped from the per-entry frame;
+ * when it was the last, that frame is destroyed and the parent
+ * self-heal frame is notified.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_utimens_cbk (call_frame_t *impunge_frame, void *cookie,
+                                  xlator_t *this, int32_t op_ret,
+                                  int32_t op_errno, struct stat *stbuf)
+{
+        int              call_count    = 0;
+        afr_private_t   *priv          = NULL;
+        afr_local_t     *impunge_local = NULL;
+        afr_self_heal_t *impunge_sh    = NULL;
+        call_frame_t    *frame         = NULL;
+        int              active_src    = 0;
+        int              child_index   = 0;
+
+        priv          = this->private;
+        impunge_local = impunge_frame->local;
+        impunge_sh    = &impunge_local->self_heal;
+        frame         = impunge_sh->sh_frame;
+        child_index   = (long) cookie;
+
+        /* must be the subvolume the entries are being read from, as in
+           the other impunge callbacks; leaving it at 0 would make
+           afr_sh_entry_impunge_entry_done continue the readdir on child
+           0.  Read it now: the per-entry state is gone after the frame
+           is destroyed below. */
+        active_src    = impunge_sh->active_source;
+
+        if (op_ret == 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "utimes set for %s on %s",
+                        impunge_local->loc.path,
+                        priv->children[child_index]->name);
+        } else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "setting utimes of %s on %s failed (%s)",
+                        impunge_local->loc.path,
+                        priv->children[child_index]->name,
+                        strerror (op_errno));
+        }
+
+        LOCK (&impunge_frame->lock);
+        {
+                call_count = --impunge_local->call_count;
+        }
+        UNLOCK (&impunge_frame->lock);
+
+        if (call_count == 0) {
+                AFR_STACK_DESTROY (impunge_frame);
+                afr_sh_entry_impunge_entry_done (frame, this, active_src);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Callback for the chown wound by afr_sh_entry_impunge_newfile_cbk on
+ * sink @cookie.
+ *
+ * On success the access/modification times saved from the lookup on the
+ * active source (impunge_local->cont.lookup.buf) are applied to the new
+ * entry with utimens.  On failure the error is logged and this sink's
+ * pending call is dropped from the per-entry frame; when it was the
+ * last, that frame is destroyed and the parent self-heal frame is
+ * notified.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_chown_cbk (call_frame_t *impunge_frame, void *cookie,
+                                xlator_t *this, int32_t op_ret,
+                                int32_t op_errno, struct stat *stbuf)
+{
+        int              call_count    = 0;
+        afr_private_t   *priv          = NULL;
+        afr_local_t     *impunge_local = NULL;
+        afr_self_heal_t *impunge_sh    = NULL;
+        call_frame_t    *frame         = NULL;
+        int              active_src    = 0;
+        int              child_index   = 0;
+        /* zeroed so tv_nsec is defined even when only whole seconds are
+           available (the #else branch below) */
+        struct timespec  ts[2]         = {{0, 0}, {0, 0}};
+
+
+        priv          = this->private;
+        impunge_local = impunge_frame->local;
+        impunge_sh    = &impunge_local->self_heal;
+        frame         = impunge_sh->sh_frame;
+        child_index   = (long) cookie;
+
+        /* must be the subvolume the entries are being read from, as in
+           the other impunge callbacks; leaving it at 0 would make
+           afr_sh_entry_impunge_entry_done continue the readdir on child
+           0.  Read it now: the per-entry state is gone after the frame
+           is destroyed below. */
+        active_src    = impunge_sh->active_source;
+
+        if (op_ret == 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "ownership of %s on %s changed",
+                        impunge_local->loc.path,
+                        priv->children[child_index]->name);
+        } else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "setting ownership of %s on %s failed (%s)",
+                        impunge_local->loc.path,
+                        priv->children[child_index]->name,
+                        strerror (op_errno));
+                goto out;
+        }
+
+#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC
+        ts[0] = impunge_local->cont.lookup.buf.st_atim;
+        ts[1] = impunge_local->cont.lookup.buf.st_mtim;
+#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC
+        ts[0] = impunge_local->cont.lookup.buf.st_atimespec;
+        ts[1] = impunge_local->cont.lookup.buf.st_mtimespec;
+#else
+        ts[0].tv_sec = impunge_local->cont.lookup.buf.st_atime;
+        ts[1].tv_sec = impunge_local->cont.lookup.buf.st_mtime;
+#endif
+        STACK_WIND_COOKIE (impunge_frame,
+                           afr_sh_entry_impunge_utimens_cbk,
+                           (void *) (long) child_index,
+                           priv->children[child_index],
+                           priv->children[child_index]->fops->utimens,
+                           &impunge_local->loc, ts);
+
+        return 0;
+
+out:
+        LOCK (&impunge_frame->lock);
+        {
+                call_count = --impunge_local->call_count;
+        }
+        UNLOCK (&impunge_frame->lock);
+
+        if (call_count == 0) {
+                AFR_STACK_DESTROY (impunge_frame);
+                afr_sh_entry_impunge_entry_done (frame, this, active_src);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Callback shared by the mknod, mkdir and symlink calls that recreate a
+ * missing entry on sink @cookie.
+ *
+ * On success the new inode's st_mode is taken from @stbuf and a chown
+ * to the uid/gid saved from the lookup on the active source
+ * (impunge_local->cont.lookup.buf) is wound to the same sink.  On
+ * failure the error is logged and this sink's pending call is dropped
+ * from the per-entry frame; when it was the last, that frame is
+ * destroyed and the parent self-heal frame is notified.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie,
+                                  xlator_t *this,
+                                  int32_t op_ret, int32_t op_errno,
+                                  inode_t *inode, struct stat *stbuf)
+{
+        int              call_count    = 0;
+        afr_private_t   *priv          = NULL;
+        afr_local_t     *impunge_local = NULL;
+        afr_self_heal_t *impunge_sh    = NULL;
+        call_frame_t    *frame         = NULL;
+        int              active_src    = 0;
+        int              child_index   = 0;
+
+        priv          = this->private;
+        impunge_local = impunge_frame->local;
+        impunge_sh    = &impunge_local->self_heal;
+        frame         = impunge_sh->sh_frame;
+
+        child_index   = (long) cookie;
+
+        /* must be the subvolume the entries are being read from, as in
+           the other impunge callbacks; leaving it at 0 would make
+           afr_sh_entry_impunge_entry_done continue the readdir on child
+           0.  Read it now: the per-entry state is gone after the frame
+           is destroyed below. */
+        active_src    = impunge_sh->active_source;
+
+        if (op_ret == -1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "creation of %s on %s failed (%s)",
+                        impunge_local->loc.path,
+                        priv->children[child_index]->name,
+                        strerror (op_errno));
+                goto out;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "setting ownership of %s on %s to %d/%d",
+                impunge_local->loc.path,
+                priv->children[child_index]->name,
+                impunge_local->cont.lookup.buf.st_uid,
+                impunge_local->cont.lookup.buf.st_gid);
+
+        inode->st_mode = stbuf->st_mode;
+
+        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_chown_cbk,
+                           (void *) (long) child_index,
+                           priv->children[child_index],
+                           priv->children[child_index]->fops->chown,
+                           &impunge_local->loc,
+                           impunge_local->cont.lookup.buf.st_uid,
+                           impunge_local->cont.lookup.buf.st_gid);
+        return 0;
+
+out:
+        LOCK (&impunge_frame->lock);
+        {
+                call_count = --impunge_local->call_count;
+        }
+        UNLOCK (&impunge_frame->lock);
+
+        if (call_count == 0) {
+                AFR_STACK_DESTROY (impunge_frame);
+                afr_sh_entry_impunge_entry_done (frame, this, active_src);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Recreate a non-directory, non-symlink entry on sink @child_index by
+ * winding mknod with the mode and device number taken from @stbuf.  The
+ * reply goes to afr_sh_entry_impunge_newfile_cbk with @child_index as
+ * cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this,
+                            int child_index, struct stat *stbuf)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *impunge_local = impunge_frame->local;
+        xlator_t      *sink          = priv->children[child_index];
+
+        gf_log (this->name, GF_LOG_WARNING,
+                "creating file %s mode=0%o dev=0x%"GF_PRI_DEV" on %s",
+                impunge_local->loc.path,
+                stbuf->st_mode, stbuf->st_rdev,
+                sink->name);
+
+        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
+                           (void *) (long) child_index,
+                           sink,
+                           sink->fops->mknod,
+                           &impunge_local->loc,
+                           stbuf->st_mode, stbuf->st_rdev);
+
+        return 0;
+}
+
+
+
+/*
+ * Recreate a directory entry on sink @child_index by winding mkdir with
+ * the mode taken from @stbuf.  The reply goes to
+ * afr_sh_entry_impunge_newfile_cbk with @child_index as cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this,
+                            int child_index, struct stat *stbuf)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *impunge_local = impunge_frame->local;
+        xlator_t      *sink          = priv->children[child_index];
+
+        gf_log (this->name, GF_LOG_WARNING,
+                "creating directory %s mode=0%o on %s",
+                impunge_local->loc.path,
+                stbuf->st_mode,
+                sink->name);
+
+        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
+                           (void *) (long) child_index,
+                           sink,
+                           sink->fops->mkdir,
+                           &impunge_local->loc, stbuf->st_mode);
+
+        return 0;
+}
+
+
+/*
+ * Recreate a symlink on sink @child_index: the entry in the per-entry
+ * frame's loc is created pointing at @linkname.  The reply goes to
+ * afr_sh_entry_impunge_newfile_cbk with @child_index as cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this,
+                              int child_index, const char *linkname)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *impunge_local = impunge_frame->local;
+        xlator_t      *sink          = priv->children[child_index];
+
+        gf_log (this->name, GF_LOG_WARNING,
+                "creating symlink %s -> %s on %s",
+                impunge_local->loc.path, linkname,
+                sink->name);
+
+        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
+                           (void *) (long) child_index,
+                           sink,
+                           sink->fops->symlink,
+                           linkname, &impunge_local->loc);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the readlink wound to the active source by
+ * afr_sh_entry_impunge_readlink.  @cookie is the index of the sink the
+ * symlink is to be recreated on.
+ *
+ * On success the link target is passed on to
+ * afr_sh_entry_impunge_symlink.  On failure the error is logged and this
+ * sink's pending call is dropped from the per-entry frame; when it was
+ * the last, that frame is destroyed and the parent self-heal frame is
+ * notified.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie,
+                                   xlator_t *this,
+                                   int32_t op_ret, int32_t op_errno,
+                                   const char *linkname)
+{
+        afr_private_t   *priv          = this->private;
+        afr_local_t     *impunge_local = impunge_frame->local;
+        afr_self_heal_t *impunge_sh    = &impunge_local->self_heal;
+        call_frame_t    *frame         = impunge_sh->sh_frame;
+        int              active_src    = impunge_sh->active_source;
+        int              child_index   = (long) cookie;
+        int              remaining     = -1;
+
+        if (op_ret != -1) {
+                afr_sh_entry_impunge_symlink (impunge_frame, this,
+                                              child_index, linkname);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "readlink of %s on %s failed (%s)",
+                impunge_local->loc.path,
+                priv->children[active_src]->name,
+                strerror (op_errno));
+
+        LOCK (&impunge_frame->lock);
+        {
+                remaining = --impunge_local->call_count;
+        }
+        UNLOCK (&impunge_frame->lock);
+
+        if (remaining == 0) {
+                AFR_STACK_DESTROY (impunge_frame);
+                afr_sh_entry_impunge_entry_done (frame, this, active_src);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Read the target of the symlink in the per-entry frame's loc from the
+ * active source (requesting up to 4096 bytes) so it can be recreated on
+ * sink @child_index.  @child_index travels as the cookie to
+ * afr_sh_entry_impunge_readlink_cbk.  @stbuf is not used.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this,
+                               int child_index, struct stat *stbuf)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *impunge_local = impunge_frame->local;
+        int            active_src    = impunge_local->self_heal.active_source;
+        xlator_t      *src_subvol    = priv->children[active_src];
+
+        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk,
+                           (void *) (long) child_index,
+                           src_subvol,
+                           src_subvol->fops->readlink,
+                           &impunge_local->loc, 4096);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the lookup on the active source wound by
+ * afr_sh_entry_impunge_recreate.  @cookie is the index of the sink on
+ * which the entry is missing.
+ *
+ * On success the stat is saved in impunge_local->cont.lookup.buf and
+ * the entry is recreated on the sink according to its file type:
+ * directories via mkdir, symlinks via readlink + symlink, everything
+ * else recognised via mknod.
+ *
+ * On lookup failure or an unrecognised type the problem is logged and
+ * this sink's pending call is dropped from the per-entry frame; when it
+ * was the last, that frame is destroyed and the parent self-heal frame
+ * is notified.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_recreate_lookup_cbk (call_frame_t *impunge_frame,
+                                          void *cookie, xlator_t *this,
+                                          int32_t op_ret, int32_t op_errno,
+                                          inode_t *inode, struct stat *buf,
+                                          dict_t *xattr)
+{
+        afr_private_t   *priv          = this->private;
+        afr_local_t     *impunge_local = impunge_frame->local;
+        afr_self_heal_t *impunge_sh    = &impunge_local->self_heal;
+        call_frame_t    *frame         = impunge_sh->sh_frame;
+        int              child_index   = (long) cookie;
+        int              active_src    = impunge_sh->active_source;
+        int              type          = 0;
+        int              remaining     = 0;
+
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "looking up %s on %s (for %s) failed (%s)",
+                        impunge_local->loc.path,
+                        priv->children[active_src]->name,
+                        priv->children[child_index]->name,
+                        strerror (op_errno));
+                goto drop;
+        }
+
+        /* kept for the chown/utimens steps that follow the creation */
+        impunge_local->cont.lookup.buf = *buf;
+        type = (buf->st_mode & S_IFMT);
+
+        if (type == S_IFDIR) {
+                afr_sh_entry_impunge_mkdir (impunge_frame, this,
+                                            child_index, buf);
+                return 0;
+        }
+
+        if (type == S_IFLNK) {
+                afr_sh_entry_impunge_readlink (impunge_frame, this,
+                                               child_index, buf);
+                return 0;
+        }
+
+        if (type == S_IFSOCK || type == S_IFREG || type == S_IFBLK
+            || type == S_IFCHR || type == S_IFIFO) {
+                afr_sh_entry_impunge_mknod (impunge_frame, this,
+                                            child_index, buf);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "%s has unknown file type on %s: 0%o",
+                impunge_local->loc.path,
+                priv->children[active_src]->name, type);
+
+drop:
+        LOCK (&impunge_frame->lock);
+        {
+                remaining = --impunge_local->call_count;
+        }
+        UNLOCK (&impunge_frame->lock);
+
+        if (remaining == 0) {
+                AFR_STACK_DESTROY (impunge_frame);
+                afr_sh_entry_impunge_entry_done (frame, this, active_src);
+        }
+
+        return 0;
+}
+
+
+/*
+ * An entry is missing on sink @child_index: look it up on the active
+ * source to learn its attributes.  @child_index travels as the cookie
+ * to afr_sh_entry_impunge_recreate_lookup_cbk, which does the actual
+ * recreation.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_recreate (call_frame_t *impunge_frame, xlator_t *this,
+                               int child_index)
+{
+        afr_private_t *priv          = this->private;
+        afr_local_t   *impunge_local = impunge_frame->local;
+        int            active_src    = impunge_local->self_heal.active_source;
+        xlator_t      *src_subvol    = priv->children[active_src];
+
+        STACK_WIND_COOKIE (impunge_frame,
+                           afr_sh_entry_impunge_recreate_lookup_cbk,
+                           (void *) (long) child_index,
+                           src_subvol,
+                           src_subvol->fops->lookup,
+                           &impunge_local->loc, 0);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the per-sink lookups wound by afr_sh_entry_impunge_entry.
+ * @cookie is the index of the sink that was looked up.
+ *
+ *  - ENOENT: the entry is missing on that sink and is recreated via
+ *    afr_sh_entry_impunge_recreate; the pending call for this sink is
+ *    dropped later, by the recreate callbacks.
+ *  - success or any other error: only logged, then this sink's pending
+ *    call is dropped from the per-entry frame; when it was the last,
+ *    that frame is destroyed and the parent self-heal frame is notified.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_entry_cbk (call_frame_t *impunge_frame, void *cookie,
+                                xlator_t *this,
+                                int32_t op_ret, int32_t op_errno,
+                                inode_t *inode, struct stat *buf, dict_t *x)
+{
+        afr_private_t   *priv          = this->private;
+        afr_local_t     *impunge_local = impunge_frame->local;
+        afr_self_heal_t *impunge_sh    = &impunge_local->self_heal;
+        call_frame_t    *frame         = impunge_sh->sh_frame;
+        int              child_index   = (long) cookie;
+        int              active_src    = impunge_sh->active_source;
+        int              remaining     = 0;
+
+        if (op_ret == -1 && op_errno == ENOENT) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "missing entry %s on %s",
+                        impunge_local->loc.path,
+                        priv->children[child_index]->name);
+
+                afr_sh_entry_impunge_recreate (impunge_frame, this,
+                                               child_index);
+                return 0;
+        }
+
+        if (op_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "looking up %s under %s failed (%s)",
+                        impunge_local->loc.path,
+                        priv->children[child_index]->name,
+                        strerror (op_errno));
+        } else {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "%s exists under %s",
+                        impunge_local->loc.path,
+                        priv->children[child_index]->name);
+        }
+
+        LOCK (&impunge_frame->lock);
+        {
+                remaining = --impunge_local->call_count;
+        }
+        UNLOCK (&impunge_frame->lock);
+
+        if (remaining == 0) {
+                AFR_STACK_DESTROY (impunge_frame);
+                afr_sh_entry_impunge_entry_done (frame, this, active_src);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Start the impunge processing of one directory entry @name read from
+ * the active source (sh->active_source): a private frame is created for
+ * the entry and a lookup of it is wound to every other subvolume that is
+ * up and not marked as a source; afr_sh_entry_impunge_entry_cbk handles
+ * each reply.
+ *
+ * "." and ".." are skipped.  Whenever no lookup is wound (skip, failure,
+ * or no eligible sink) afr_sh_entry_impunge_entry_done is called right
+ * away so the parent frame's pending-call count stays balanced, and a
+ * per-entry frame that was already created is released instead of being
+ * leaked.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this,
+                            char *name)
+{
+        afr_private_t   *priv          = NULL;
+        afr_local_t     *local         = NULL;
+        afr_self_heal_t *sh            = NULL;
+        int              ret           = -1;
+        call_frame_t    *impunge_frame = NULL;
+        afr_local_t     *impunge_local = NULL;
+        afr_self_heal_t *impunge_sh    = NULL;
+        int              active_src    = 0;
+        int              i             = 0;
+        int              call_count    = 0;
+        /* not read here; presumably assigned inside ALLOC_OR_GOTO, so it
+           is kept exactly as in the original -- confirm against afr.h */
+        int              op_errno      = 0;
+
+        priv  = this->private;
+        local = frame->local;
+        sh    = &local->self_heal;
+
+        active_src = sh->active_source;
+
+        if ((strcmp (name, ".") == 0)
+            || (strcmp (name, "..") == 0)) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "skipping inspection of %s under %s",
+                        name, local->loc.path);
+                goto out;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "inspecting existance of %s under %s",
+                name, local->loc.path);
+
+        impunge_frame = copy_frame (frame);
+        if (!impunge_frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (impunge_local, afr_local_t, out);
+
+        impunge_frame->local      = impunge_local;
+        impunge_sh                = &impunge_local->self_heal;
+        impunge_sh->sh_frame      = frame;
+        impunge_sh->active_source = active_src;
+
+        ret = build_child_loc (this, &impunge_local->loc, &local->loc, name);
+        if (ret != 0) {
+                goto out;
+        }
+
+        /* count the sinks first: the per-entry call count has to be in
+           place before the first reply can arrive */
+        for (i = 0; i < priv->child_count; i++) {
+                if (i == active_src)
+                        continue;
+                if (local->child_up[i] == 0)
+                        continue;
+                if (sh->sources[i] == 1)
+                        continue;
+                call_count++;
+        }
+
+        if (call_count == 0) {
+                /* no lookup would be wound, hence no callback would ever
+                   report this entry as done: finish it here */
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no sinks to inspect %s on",
+                        impunge_local->loc.path);
+                ret = -1;
+                goto out;
+        }
+
+        impunge_local->call_count = call_count;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (i == active_src)
+                        continue;
+                if (local->child_up[i] == 0)
+                        continue;
+                if (sh->sources[i] == 1)
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "looking up %s on %s", impunge_local->loc.path,
+                        priv->children[i]->name);
+
+                STACK_WIND_COOKIE (impunge_frame,
+                                   afr_sh_entry_impunge_entry_cbk,
+                                   (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->lookup,
+                                   &impunge_local->loc, 0);
+
+                /* stop right after the last wind: once every reply is
+                   in, the per-entry frame may already be gone */
+                if (!--call_count)
+                        break;
+        }
+
+        ret = 0;
+out:
+        if (ret == -1) {
+                /* nothing was wound on the per-entry frame, so no
+                   callback will ever release it: do it here */
+                if (impunge_frame) {
+                        if (impunge_frame->local) {
+                                AFR_STACK_DESTROY (impunge_frame);
+                        } else {
+                                STACK_DESTROY (impunge_frame->root);
+                        }
+                }
+
+                afr_sh_entry_impunge_entry_done (frame, this, active_src);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Callback for the readdir wound by afr_sh_entry_impunge_subvol.
+ *
+ * op_ret <= 0 (error, or no more entries) ends the scan of the current
+ * active source: the outcome is logged and afr_sh_entry_impunge_all is
+ * called to move on.
+ *
+ * Otherwise the batch is walked twice: first to count the entries and
+ * remember the d_off of the last one (stored in sh->offset for the next
+ * readdir), then to start afr_sh_entry_impunge_entry for each entry.
+ * local->call_count is set to the entry count before the second walk;
+ * every entry eventually reports back through
+ * afr_sh_entry_impunge_entry_done.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ gf_dirent_t *entries)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+ gf_dirent_t *entry = NULL;
+ off_t last_offset = 0;
+ int active_src = 0;
+ int entry_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ active_src = sh->active_source;
+
+ if (op_ret <= 0) {
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "readdir of %s on subvolume %s failed (%s)",
+ local->loc.path,
+ priv->children[active_src]->name,
+ strerror (op_errno));
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "readdir of %s on subvolume %s complete",
+ local->loc.path,
+ priv->children[active_src]->name);
+ }
+
+ afr_sh_entry_impunge_all (frame, this);
+ return 0;
+ }
+
+ /* first pass: count the batch and find where the next readdir resumes */
+ list_for_each_entry (entry, &entries->list, list) {
+ last_offset = entry->d_off;
+ entry_count++;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "readdir'ed %d entries from %s",
+ entry_count, priv->children[active_src]->name);
+
+ /* both must be in place before the first entry is dispatched below */
+ sh->offset = last_offset;
+ local->call_count = entry_count;
+
+ /* second pass: one inspection per entry */
+ list_for_each_entry (entry, &entries->list, list) {
+ afr_sh_entry_impunge_entry (frame, this, entry->d_name);
+ }
+
+ return 0;
+}
+
+
+/*
+ * Request the next batch of directory entries from subvolume
+ * @active_src, reading sh->block_size bytes starting at sh->offset on
+ * the healing fd.  The reply goes to afr_sh_entry_impunge_readdir_cbk.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this,
+                             int active_src)
+{
+        afr_private_t   *priv   = this->private;
+        afr_local_t     *local  = frame->local;
+        afr_self_heal_t *sh     = &local->self_heal;
+        xlator_t        *subvol = priv->children[active_src];
+
+        STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk,
+                    subvol,
+                    subvol->fops->readdir,
+                    sh->healing_fd, sh->block_size, sh->offset);
+
+        return 0;
+}
+
+
+/*
+ * Drive the impunge phase across subvolumes.  Each call resets the
+ * readdir offset and advances sh->active_source via next_active_source.
+ *
+ *  - sh->op_failed set: the self-heal is wound up through
+ *    afr_sh_entry_finish.
+ *  - no further active source (-1): impunging is complete and the
+ *    expunge phase is started.
+ *  - otherwise: entries of the new active source are impunged.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t   *priv       = this->private;
+        afr_local_t     *local      = frame->local;
+        afr_self_heal_t *sh         = &local->self_heal;
+        int              active_src = -1;
+
+        sh->offset = 0;
+
+        active_src = next_active_source (frame, this, sh->active_source);
+        sh->active_source = active_src;
+
+        if (sh->op_failed) {
+                afr_sh_entry_finish (frame, this);
+        } else if (active_src == -1) {
+                /* missing entries have been created everywhere; go on
+                   to removing stale ones */
+                afr_sh_entry_expunge_all (frame, this);
+        } else {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "impunging entries of %s on %s to other sinks",
+                        local->loc.path, priv->children[active_src]->name);
+
+                afr_sh_entry_impunge_subvol (frame, this, active_src);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Callback for every opendir wound by afr_sh_entry_open.  @cookie is the
+ * index of the subvolume that replied.  A failure is logged and recorded
+ * in sh->op_failed.  Once the last reply is in, the self-heal is either
+ * finished (if any opendir failed) or the impunge phase is started with
+ * sh->active_source reset to -1.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, fd_t *fd)
+{
+        afr_local_t     *local       = frame->local;
+        afr_self_heal_t *sh          = &local->self_heal;
+        afr_private_t   *priv        = this->private;
+        int              child_index = (long) cookie;
+        int              pending     = 0;
+
+        /* TODO: some of the open's might fail.
+           In that case, modify cleanup fn to send flush on those
+           fd's which are already open */
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "opendir of %s failed on child %s (%s)",
+                                local->loc.path,
+                                priv->children[child_index]->name,
+                                strerror (op_errno));
+                        sh->op_failed = 1;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        pending = afr_frame_return (frame);
+        if (pending != 0)
+                return 0;
+
+        if (sh->op_failed) {
+                afr_sh_entry_finish (frame, this);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "fd for %s opened, commencing sync",
+                local->loc.path);
+
+        sh->active_source = -1;
+        afr_sh_entry_impunge_all (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Open the directory being healed on the source (if there is one) and
+ * on every sink that is up, all sharing one newly created fd that is
+ * stored in sh->healing_fd.  Also sets the readdir block size (131072)
+ * and resets the readdir offset.  Every reply is handled by
+ * afr_sh_entry_opendir_cbk with the subvolume index as cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_open (call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ int call_count = 0;
+
+ int source = -1;
+ int *sources = NULL;
+
+ fd_t *fd = NULL;
+
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ local = frame->local;
+ sh = &local->self_heal;
+ priv = this->private;
+
+ source = local->self_heal.source;
+ sources = local->self_heal.sources;
+
+ sh->block_size = 131072;
+ sh->offset = 0;
+
+ /* one opendir per active sink, plus one for the source if present */
+ call_count = sh->active_sinks;
+ if (source != -1)
+ call_count++;
+
+ /* must be set before the first wind: replies decrement it */
+ local->call_count = call_count;
+
+ /* NOTE(review): the result of fd_create () is not checked before it
+    is handed to opendir -- confirm whether it can return NULL here */
+ fd = fd_create (local->loc.inode, frame->root->pid);
+ sh->healing_fd = fd;
+
+ if (source != -1) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "opening directory %s on subvolume %s (source)",
+ local->loc.path, priv->children[source]->name);
+
+ /* open source */
+ STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
+ (void *) (long) source,
+ priv->children[source],
+ priv->children[source]->fops->opendir,
+ &local->loc, fd);
+ call_count--;
+ }
+
+ /* open sinks */
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i] || !local->child_up[i])
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "opening directory %s on subvolume %s (sink)",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->opendir,
+ &local->loc, fd);
+
+ /* leave the loop as soon as the last expected opendir is wound */
+ if (!--call_count)
+ break;
+ }
+
+ return 0;
+}
+
+
+/*
+ * Decide whether an entry self-heal can proceed and, if so, start it.
+ *
+ * Every child that is up and not marked as a source counts as an active
+ * sink and gets sh->success[] set; so does the source, if there is one.
+ * The heal is abandoned (afr_sh_entry_finish) when there are no active
+ * sinks, or when there is no source and fewer than two sinks.  Otherwise
+ * the sink count is stored in sh->active_sinks and afr_sh_entry_open is
+ * called.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local        = frame->local;
+        afr_self_heal_t *sh           = &local->self_heal;
+        afr_private_t   *priv         = this->private;
+        int              source       = sh->source;
+        int              active_sinks = 0;
+        int              i            = 0;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (sh->sources[i] != 0 || local->child_up[i] != 1)
+                        continue;
+
+                active_sinks++;
+                sh->success[i] = 1;
+        }
+
+        if (source != -1)
+                sh->success[source] = 1;
+
+        if (active_sinks == 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no active sinks for self-heal on dir %s",
+                        local->loc.path);
+                afr_sh_entry_finish (frame, this);
+                return 0;
+        }
+
+        if (source == -1 && active_sinks < 2) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "cannot sync with 0 sources and 1 sink on dir %s",
+                        local->loc.path);
+                afr_sh_entry_finish (frame, this);
+                return 0;
+        }
+
+        sh->active_sinks = active_sinks;
+
+        if (source == -1) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no active sources for %s found. "
+                        "merging all entries as a conservative decision",
+                        local->loc.path);
+        } else {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "syncing %s from subvolume %s to %d active sinks",
+                        local->loc.path, priv->children[source]->name,
+                        active_sinks);
+        }
+
+        afr_sh_entry_open (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Work out which children are sources for an entry self-heal.
+ *
+ * Builds and prints the pending matrix from the collected xattrs
+ * (AFR_ENTRY_PENDING), marks sources from it, filters them through
+ * afr_sh_supress_errenous_children() using sh->child_errno, stores the
+ * selected source in sh->source and hands over to
+ * afr_sh_entry_sync_prepare().
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_fix (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local    = frame->local;
+        afr_private_t   *priv     = this->private;
+        afr_self_heal_t *sh       = &local->self_heal;
+        int              children = priv->child_count;
+
+        afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr,
+                                     children, AFR_ENTRY_PENDING);
+        afr_sh_print_pending_matrix (sh->pending_matrix, this);
+
+        afr_sh_mark_sources (sh->pending_matrix, sh->sources, children);
+        afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
+                                          children);
+
+        sh->source = afr_sh_select_source (sh->sources, children);
+
+        afr_sh_entry_sync_prepare (frame, this);
+
+        return 0;
+}
+
+
+
+/*
+ * Callback for the per-child lookup issued by afr_sh_entry_lookup().
+ *
+ * cookie carries the child index.  On success (op_ret != -1) the
+ * child's stat is stored in sh->buf[] and, when the reply carries one,
+ * a reference to its xattr dict is stored in sh->xattr[].  The xattr
+ * pointer is checked before dict_ref(), as the metadata lookup callback
+ * in this translator does, so a reply without a dict is not
+ * dereferenced.  When the last reply arrives, afr_sh_entry_fix() runs.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie,
+                         xlator_t *this, int32_t op_ret, int32_t op_errno,
+                         inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+        afr_local_t     *local       = NULL;
+        afr_self_heal_t *sh          = NULL;
+
+        int call_count  = -1;
+        int child_index = (long) cookie;
+
+        local = frame->local;
+        sh    = &local->self_heal;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        if (xattr)
+                                sh->xattr[child_index] = dict_ref (xattr);
+                        sh->buf[child_index] = *buf;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                afr_sh_entry_fix (frame, this);
+        }
+
+        return 0;
+}
+
+
+
+/*
+ * Look up local->loc on every child that is up, asking for the
+ * AFR_ENTRY_PENDING xattr.
+ *
+ * local->call_count is primed with local->child_count; replies are
+ * handled by afr_sh_entry_lookup_cbk(), which receives the child index
+ * as cookie.  The request dict is released once all lookups have been
+ * wound.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t *priv      = this->private;
+        afr_local_t   *local     = frame->local;
+        dict_t        *req       = NULL;
+        int            remaining = local->child_count;
+        int            set_ret   = 0;
+        int            child     = 0;
+
+        local->call_count = remaining;
+
+        req = dict_new();
+        if (req != NULL)
+                set_ret = dict_set_uint64 (req, AFR_ENTRY_PENDING,
+                                           priv->child_count * sizeof(int32_t));
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame,
+                                   afr_sh_entry_lookup_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->lookup,
+                                   &local->loc, req);
+
+                /* stop once every expected call has been wound */
+                if (--remaining == 0)
+                        break;
+        }
+
+        if (req != NULL)
+                dict_unref (req);
+
+        return 0;
+}
+
+
+
+/*
+ * Callback for the entry locks taken by afr_sh_entry_lock().
+ *
+ * cookie carries the child index.  A failed lock sets sh->op_failed
+ * and is logged (at debug level for EAGAIN, error level otherwise).
+ * After the last reply, the heal is ended with afr_sh_entry_finish()
+ * if any lock failed, otherwise it continues with
+ * afr_sh_entry_lookup().
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t     *local     = frame->local;
+        afr_self_heal_t *sh        = &local->self_heal;
+        int              child     = (long) cookie;
+        int              remaining = 0;
+
+        /* TODO: what if lock fails? */
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "inode of %s on child %d locked",
+                                local->loc.path, child);
+                } else {
+                        sh->op_failed = 1;
+
+                        gf_log (this->name,
+                                (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
+                                "locking inode of %s on child %d failed: %s",
+                                local->loc.path, child,
+                                strerror (op_errno));
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        if (sh->op_failed == 1)
+                afr_sh_entry_finish (frame, this);
+        else
+                afr_sh_entry_lookup (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Request a non-blocking write entry-lock (NULL basename) on local->loc
+ * from every child that is up.
+ *
+ * local->call_count is primed with local->child_count; replies go to
+ * afr_sh_entry_lock_cbk() with the child index as cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_entry_lock (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        int            remaining = local->child_count;
+        int            child     = 0;
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "locking %s on subvolume %s",
+                        local->loc.path, priv->children[child]->name);
+
+                STACK_WIND_COOKIE (frame, afr_sh_entry_lock_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->entrylk,
+                                   &local->loc, NULL,
+                                   ENTRYLK_LOCK_NB, ENTRYLK_WRLCK);
+
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Entry point of the entry (directory) self-heal phase.
+ *
+ * Starts the heal with afr_sh_entry_lock() when the frame needs an
+ * entry self-heal and the translator has entry self-heal enabled;
+ * otherwise goes straight to afr_sh_entry_done().
+ *
+ * Always returns 0.
+ */
+int
+afr_self_heal_entry (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t *priv  = this->private;
+        afr_local_t   *local = frame->local;
+
+        if (!(local->need_entry_self_heal && priv->entry_self_heal)) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "proceeding to completion on %s",
+                        local->loc.path);
+                afr_sh_entry_done (frame, this);
+                return 0;
+        }
+
+        afr_sh_entry_lock (frame, this);
+
+        return 0;
+}
+
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
new file mode 100644
index 000000000..e65a426db
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -0,0 +1,791 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+#include "byte-order.h"
+
+#include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-self-heal-common.h"
+
+
+/*
+ * Finish the metadata phase and pick the next self-heal step.
+ *
+ * Clears sh->buf and sh->success and drops the references held in
+ * sh->xattr[] (sh->child_errno is not cleared; the memset for it is
+ * disabled).  If the heal was aborted (local->govinda_gOvinda) the
+ * completion callback is invoked.  Otherwise regular files proceed to
+ * afr_self_heal_data(), directories to afr_self_heal_entry(), and any
+ * other file type completes immediately.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_done (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        afr_private_t   *priv  = this->private;
+        int              child = 0;
+
+        memset (sh->buf, 0, sizeof (struct stat) * priv->child_count);
+        memset (sh->success, 0, sizeof (int) * priv->child_count);
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (sh->xattr[child] != NULL)
+                        dict_unref (sh->xattr[child]);
+                sh->xattr[child] = NULL;
+        }
+
+        if (local->govinda_gOvinda) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "aborting selfheal of %s",
+                        local->loc.path);
+                sh->completion_cbk (frame, this);
+                return 0;
+        }
+
+        if (S_ISREG (local->cont.lookup.buf.st_mode)) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "proceeding to data check on %s",
+                        local->loc.path);
+                afr_self_heal_data (frame, this);
+                return 0;
+        }
+
+        if (S_ISDIR (local->cont.lookup.buf.st_mode)) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "proceeding to entry check on %s",
+                        local->loc.path);
+                afr_self_heal_entry (frame, this);
+                return 0;
+        }
+
+        /* neither a regular file nor a directory: nothing more to heal */
+        gf_log (this->name, GF_LOG_DEBUG,
+                "completed self heal of %s",
+                local->loc.path);
+        sh->completion_cbk (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the unlock requests sent by afr_sh_metadata_finish().
+ *
+ * The result of the unlock is not inspected.  Once the last reply has
+ * arrived, afr_sh_metadata_done() is called.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno)
+{
+        int remaining = 0;
+
+        /* empty critical section, kept as in the original code */
+        LOCK (&frame->lock);
+        {
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        afr_sh_metadata_done (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Release the inode lock on local->loc on every child that is up.
+ *
+ * An F_UNLCK request with l_start = 0 and l_len = 0 is sent through
+ * inodelk to each live child; replies are collected by
+ * afr_sh_metadata_unlck_cbk().  local->call_count is primed with
+ * local->child_count.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        struct flock   unlock    = {0, };
+        int            remaining = local->child_count;
+        int            child     = 0;
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                /* refreshed on every pass, as the original code does */
+                unlock.l_start = 0;
+                unlock.l_len   = 0;
+                unlock.l_type  = F_UNLCK;
+
+                if (!local->child_up[child])
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unlocking %s on subvolume %s",
+                        local->loc.path, priv->children[child]->name);
+
+                STACK_WIND (frame, afr_sh_metadata_unlck_cbk,
+                            priv->children[child],
+                            priv->children[child]->fops->inodelk,
+                            &local->loc, F_SETLK, &unlock);
+
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Callback for the xattrop calls sent by
+ * afr_sh_metadata_erase_pending().
+ *
+ * The result is not inspected.  After the last reply the locks are
+ * released through afr_sh_metadata_finish().
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie,
+                                   xlator_t *this, int32_t op_ret,
+                                   int32_t op_errno, dict_t *xattr)
+{
+        int remaining = 0;
+
+        /* empty critical section, kept as in the original code */
+        LOCK (&frame->lock);
+        {
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        afr_sh_metadata_finish (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Erase the metadata pending flags after a sync attempt.
+ *
+ * The pending matrix is converted to a delta matrix (taking
+ * sh->success into account), one xattr dict is prepared for every
+ * child whose lookup returned an xattr, and the deltas are applied on
+ * those children with xattrop (GF_XATTROP_ADD_ARRAY).  Replies are
+ * collected by afr_sh_metadata_erase_pending_cbk().
+ *
+ * If the dict array cannot be allocated, or no child needs an
+ * xattrop, the locks are released right away through
+ * afr_sh_metadata_finish().  A child whose dict could not be created
+ * is skipped rather than dereferencing a NULL dict.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local = NULL;
+        afr_self_heal_t *sh = NULL;
+        afr_private_t   *priv = NULL;
+        int              call_count = 0;
+        int              i = 0;
+        dict_t         **erase_xattr = NULL;
+
+
+        local = frame->local;
+        sh = &local->self_heal;
+        priv = this->private;
+
+
+        afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix,
+                                 sh->success, priv->child_count);
+
+        erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count);
+        if (!erase_xattr) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory, not erasing pending flags of %s",
+                        local->loc.path);
+
+                afr_sh_metadata_finish (frame, this);
+                return 0;
+        }
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (!sh->xattr[i])
+                        continue;
+
+                erase_xattr[i] = get_new_dict();
+                if (!erase_xattr[i])
+                        continue; /* no dict, no xattrop for this child */
+
+                dict_ref (erase_xattr[i]);
+                call_count++;
+        }
+
+        afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr,
+                               priv->child_count, AFR_METADATA_PENDING);
+
+        local->call_count = call_count;
+
+        if (call_count == 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "metadata of %s not healed on any subvolume",
+                        local->loc.path);
+
+                /* nothing was wound: release the array and stop here */
+                FREE (erase_xattr);
+                afr_sh_metadata_finish (frame, this);
+                return 0;
+        }
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (!erase_xattr[i])
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "erasing pending flags from %s on %s",
+                        local->loc.path, priv->children[i]->name);
+
+                STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk,
+                                   (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->xattrop,
+                                   &local->loc,
+                                   GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
+                if (!--call_count)
+                        break;
+        }
+
+        /* drop the references taken above */
+        for (i = 0; i < priv->child_count; i++) {
+                if (erase_xattr[i]) {
+                        dict_unref (erase_xattr[i]);
+                }
+        }
+        FREE (erase_xattr);
+
+        return 0;
+}
+
+
+/*
+ * Common callback for the attribute and xattr updates sent to sinks by
+ * afr_sh_metadata_sync().
+ *
+ * cookie carries the child index.  A failure is logged and clears
+ * that child's slot in sh->success.  After the last reply the pending
+ * flags are erased through afr_sh_metadata_erase_pending().
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t     *local     = frame->local;
+        afr_self_heal_t *sh        = &local->self_heal;
+        afr_private_t   *priv      = this->private;
+        int              child     = (long) cookie;
+        int              remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == -1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "setting attributes failed for %s on %s (%s)",
+                                local->loc.path,
+                                priv->children[child]->name,
+                                strerror (op_errno));
+
+                        sh->success[child] = 0;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        afr_sh_metadata_erase_pending (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the chown/chmod/utimens calls of the metadata sync.
+ * The returned stat is ignored; the result is forwarded to
+ * afr_sh_metadata_sync_cbk().  Always returns 0.
+ */
+int
+afr_sh_metadata_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        (void) afr_sh_metadata_sync_cbk (frame, cookie, this,
+                                         op_ret, op_errno);
+        return 0;
+}
+
+
+/*
+ * Callback for the setxattr call of the metadata sync.  The result is
+ * forwarded to afr_sh_metadata_sync_cbk().  Always returns 0.
+ */
+int
+afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno)
+{
+        (void) afr_sh_metadata_sync_cbk (frame, cookie, this,
+                                         op_ret, op_errno);
+        return 0;
+}
+
+
+/*
+ * Copy ownership, mode, timestamps and (optionally) extended
+ * attributes from the source child to every active sink.
+ *
+ * frame, this - call frame and translator of the self-heal.
+ * xattr       - xattrs read from the source, or NULL to skip setxattr.
+ *
+ * Each sink (a child that is up and not a source) receives chown,
+ * chmod and utimens, plus setxattr when xattr is non-NULL, so
+ * local->call_count is primed with 3 or 4 calls per active sink.
+ * Replies are collected by afr_sh_metadata_attr_cbk() and
+ * afr_sh_metadata_xattr_cbk().
+ *
+ * The timespec pair is zero-initialised so that the fallback branch,
+ * which only has second resolution, does not send an indeterminate
+ * tv_nsec to utimens.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr)
+{
+        afr_local_t     *local = NULL;
+        afr_self_heal_t *sh = NULL;
+        afr_private_t   *priv = NULL;
+        int              source = 0;
+        int              active_sinks = 0;
+        int              call_count = 0;
+        int              i = 0;
+        struct timespec  ts[2] = {{0, }, {0, }};
+
+
+        local = frame->local;
+        sh = &local->self_heal;
+        priv = this->private;
+
+        source = sh->source;
+        active_sinks = sh->active_sinks;
+
+        /*
+         * 4 calls per sink - chown, chmod, utimes, setxattr
+         */
+        if (xattr)
+                call_count = active_sinks * 4;
+        else
+                call_count = active_sinks * 3;
+
+        local->call_count = call_count;
+
+#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC
+        ts[0] = sh->buf[source].st_atim;
+        ts[1] = sh->buf[source].st_mtim;
+#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC
+        ts[0] = sh->buf[source].st_atimespec;
+        ts[1] = sh->buf[source].st_mtimespec;
+#else
+        /* second resolution only; tv_nsec stays 0 from the initialiser */
+        ts[0].tv_sec = sh->buf[source].st_atime;
+        ts[1].tv_sec = sh->buf[source].st_mtime;
+#endif
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (call_count == 0) {
+                        break;
+                }
+                if (sh->sources[i] || !local->child_up[i])
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "syncing metadata of %s from %s to %s",
+                        local->loc.path, priv->children[source]->name,
+                        priv->children[i]->name);
+
+                STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk,
+                                   (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->chown,
+                                   &local->loc,
+                                   sh->buf[source].st_uid,
+                                   sh->buf[source].st_gid);
+
+                STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk,
+                                   (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->chmod,
+                                   &local->loc, sh->buf[source].st_mode);
+
+                STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk,
+                                   (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->utimens,
+                                   &local->loc, ts);
+
+                call_count = call_count - 3;
+
+                if (!xattr)
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk,
+                                   (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->setxattr,
+                                   &local->loc, xattr, 0);
+                call_count--;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Callback for the getxattr issued on the source child by
+ * afr_sh_metadata_sync_prepare().
+ *
+ * On failure the sync proceeds without xattrs.  On success the three
+ * AFR pending keys are removed from the returned dict before it is
+ * handed to afr_sh_metadata_sync().
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie,
+                              xlator_t *this,
+                              int32_t op_ret, int32_t op_errno, dict_t *xattr)
+{
+        afr_local_t     *local = frame->local;
+        afr_self_heal_t *sh    = &local->self_heal;
+        afr_private_t   *priv  = this->private;
+        int              src   = sh->source;
+
+        if (op_ret != -1) {
+                /* the changelog keys must not be copied to the sinks */
+                dict_del (xattr, AFR_DATA_PENDING);
+                dict_del (xattr, AFR_METADATA_PENDING);
+                dict_del (xattr, AFR_ENTRY_PENDING);
+
+                afr_sh_metadata_sync (frame, this, xattr);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "getxattr of %s failed on subvolume %s (%s). proceeding without xattr",
+                local->loc.path, priv->children[src]->name,
+                strerror (op_errno));
+
+        afr_sh_metadata_sync (frame, this, NULL);
+
+        return 0;
+}
+
+
+/*
+ * Count the sinks for a metadata self-heal and fetch the source's
+ * xattrs.
+ *
+ * A sink is a child that is up and not marked as a source.  Every
+ * sink and the source get their slot in sh->success set to 1.  With
+ * no sinks the heal is ended through afr_sh_metadata_finish();
+ * otherwise sh->active_sinks is recorded and getxattr is wound to the
+ * source, with afr_sh_metadata_getxattr_cbk() as callback.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local      = frame->local;
+        afr_self_heal_t *sh         = &local->self_heal;
+        afr_private_t   *priv       = this->private;
+        int              src        = sh->source;
+        int              sink_count = 0;
+        int              idx        = 0;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (sh->sources[idx] != 0 || local->child_up[idx] != 1)
+                        continue;
+
+                sh->success[idx] = 1;
+                sink_count++;
+        }
+        sh->success[src] = 1;
+
+        if (sink_count == 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no active sinks for performing self-heal on file %s",
+                        local->loc.path);
+                afr_sh_metadata_finish (frame, this);
+                return 0;
+        }
+
+        sh->active_sinks = sink_count;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "syncing metadata of %s from subvolume %s to %d active sinks",
+                local->loc.path, priv->children[src]->name, sink_count);
+
+        STACK_WIND (frame, afr_sh_metadata_getxattr_cbk,
+                    priv->children[src],
+                    priv->children[src]->fops->getxattr,
+                    &local->loc, NULL);
+
+        return 0;
+}
+
+
+/*
+ * Decide the source for a metadata self-heal.
+ *
+ * Sources are derived from the AFR_METADATA_PENDING matrix and
+ * filtered through afr_sh_supress_errenous_children().  If none is
+ * left and a favorite child is configured and answered its lookup
+ * without error, that child is used as the source.  If there is still
+ * no source, the heal is flagged as aborted (local->govinda_gOvinda)
+ * and ended through afr_sh_metadata_finish().
+ *
+ * After the source is selected, any other child without a lookup
+ * error whose mode or ownership differs from the source is demoted
+ * from source to sink, then afr_sh_metadata_sync_prepare() is called.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local    = frame->local;
+        afr_self_heal_t *sh       = &local->self_heal;
+        afr_private_t   *priv     = this->private;
+        int              children = priv->child_count;
+        int              favorite = priv->favorite_child;
+        int              nsources = 0;
+        int              src      = 0;
+        int              idx      = 0;
+
+        afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr,
+                                     children, AFR_METADATA_PENDING);
+        afr_sh_print_pending_matrix (sh->pending_matrix, this);
+
+        afr_sh_mark_sources (sh->pending_matrix, sh->sources, children);
+        afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
+                                          children);
+
+        nsources = afr_sh_source_count (sh->sources, children);
+
+        if (nsources == 0 && favorite != -1
+            && sh->child_errno[favorite] == 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Picking favorite child %s as authentic source to resolve conflicting metadata of %s",
+                        priv->children[favorite]->name,
+                        local->loc.path);
+
+                sh->sources[favorite] = 1;
+
+                nsources = afr_sh_source_count (sh->sources, children);
+        }
+
+        if (nsources == 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to resolve conflicting metadata of %s. "
+                        "Please resolve manually by fixing the "
+                        "permissions/ownership of %s on your subvolumes. "
+                        "You can also consider 'option favorite-child <>'",
+                        local->loc.path, local->loc.path);
+
+                /* tells afr_sh_metadata_done() to abort the self-heal */
+                local->govinda_gOvinda = 1;
+
+                afr_sh_metadata_finish (frame, this);
+                return 0;
+        }
+
+        src = afr_sh_select_source (sh->sources, children);
+        sh->source = src;
+
+        /* detect changes not visible through pending flags -- JIC */
+        for (idx = 0; idx < children; idx++) {
+                if (idx == src || sh->child_errno[idx])
+                        continue;
+
+                if (PERMISSION_DIFFERS (&sh->buf[idx], &sh->buf[src])
+                    || OWNERSHIP_DIFFERS (&sh->buf[idx], &sh->buf[src]))
+                        sh->sources[idx] = 0;
+        }
+
+        afr_sh_metadata_sync_prepare (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the per-child lookup issued by
+ * afr_sh_metadata_lookup().
+ *
+ * cookie carries the child index.  On success (op_ret == 0) the stat
+ * is stored in sh->buf[] and, if present, a reference to the xattr
+ * dict in sh->xattr[]; otherwise op_errno is recorded in
+ * sh->child_errno[].  After the last reply afr_sh_metadata_fix() runs.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                            int32_t op_ret, int32_t op_errno,
+                            inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+        afr_local_t     *local     = frame->local;
+        afr_self_heal_t *sh        = &local->self_heal;
+        afr_private_t   *priv      = this->private;
+        int              child     = (long) cookie;
+        int              remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != 0) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "path %s on subvolume %s => -1 (%s)",
+                                local->loc.path,
+                                priv->children[child]->name,
+                                strerror (op_errno));
+
+                        sh->child_errno[child] = op_errno;
+                } else {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "path %s on subvolume %s is of mode 0%o",
+                                local->loc.path,
+                                priv->children[child]->name,
+                                buf->st_mode);
+
+                        sh->buf[child] = *buf;
+                        if (xattr != NULL)
+                                sh->xattr[child] = dict_ref (xattr);
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        afr_sh_metadata_fix (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Look up local->loc on every child that is up, asking for the
+ * AFR_METADATA_PENDING xattr.
+ *
+ * local->call_count is primed with local->child_count; replies are
+ * handled by afr_sh_metadata_lookup_cbk(), which receives the child
+ * index as cookie.  The request dict is released once all lookups
+ * have been wound.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        dict_t        *req       = NULL;
+        int            remaining = local->child_count;
+        int            set_ret   = 0;
+        int            child     = 0;
+
+        local->call_count = remaining;
+
+        req = dict_new();
+        if (req != NULL)
+                set_ret = dict_set_uint64 (req, AFR_METADATA_PENDING,
+                                           priv->child_count * sizeof(int32_t));
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "looking up %s on %s",
+                        local->loc.path, priv->children[child]->name);
+
+                STACK_WIND_COOKIE (frame, afr_sh_metadata_lookup_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->lookup,
+                                   &local->loc, req);
+
+                if (--remaining == 0)
+                        break;
+        }
+
+        if (req != NULL)
+                dict_unref (req);
+
+        return 0;
+}
+
+
+/*
+ * Callback for the inode locks taken by afr_sh_metadata_lock().
+ *
+ * cookie carries the child index.  A failed lock sets sh->op_failed
+ * and is logged (at debug level for EAGAIN, error level otherwise).
+ * After the last reply, the heal is ended with
+ * afr_sh_metadata_finish() if any lock failed, otherwise it continues
+ * with afr_sh_metadata_lookup().
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t     *local     = frame->local;
+        afr_self_heal_t *sh        = &local->self_heal;
+        int              child     = (long) cookie;
+        int              remaining = 0;
+
+        /* TODO: what if lock fails? */
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret != -1) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "inode of %s on child %d locked",
+                                local->loc.path, child);
+                } else {
+                        sh->op_failed = 1;
+
+                        gf_log (this->name,
+                                (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
+                                "locking of %s on child %d failed: %s",
+                                local->loc.path, child,
+                                strerror (op_errno));
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        if (sh->op_failed)
+                afr_sh_metadata_finish (frame, this);
+        else
+                afr_sh_metadata_lookup (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Request a write inode-lock (F_SETLK, l_start = 0, l_len = 0) on
+ * local->loc from every child that is up.
+ *
+ * local->call_count is primed with local->child_count; replies go to
+ * afr_sh_metadata_lk_cbk() with the child index as cookie.
+ *
+ * Always returns 0.
+ */
+int
+afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local     = frame->local;
+        afr_private_t *priv      = this->private;
+        struct flock   wrlock    = {0, };
+        int            remaining = local->child_count;
+        int            child     = 0;
+
+        local->call_count = remaining;
+
+        for (child = 0; child < priv->child_count; child++) {
+                /* refreshed on every pass, as the original code does */
+                wrlock.l_start = 0;
+                wrlock.l_len   = 0;
+                wrlock.l_type  = F_WRLCK;
+
+                if (!local->child_up[child])
+                        continue;
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "locking %s on subvolume %s",
+                        local->loc.path, priv->children[child]->name);
+
+                STACK_WIND_COOKIE (frame, afr_sh_metadata_lk_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->inodelk,
+                                   &local->loc, F_SETLK, &wrlock);
+
+                if (--remaining == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Entry point of the metadata self-heal phase.
+ *
+ * Starts the heal with afr_sh_metadata_lock() when the frame needs a
+ * metadata self-heal and the translator has metadata self-heal
+ * enabled; otherwise goes straight to afr_sh_metadata_done().
+ *
+ * Always returns 0.
+ */
+int
+afr_self_heal_metadata (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t *priv  = this->private;
+        afr_local_t   *local = frame->local;
+
+        if (!(local->need_metadata_self_heal && priv->metadata_self_heal)) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "proceeding to data check on %s",
+                        local->loc.path);
+                afr_sh_metadata_done (frame, this);
+                return 0;
+        }
+
+        afr_sh_metadata_lock (frame, this);
+
+        return 0;
+}
+
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
new file mode 100644
index 000000000..1c97a9bc1
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -0,0 +1,52 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __AFR_SELF_HEAL_H__
+#define __AFR_SELF_HEAL_H__
+
+#include <sys/stat.h>
+
+/*
+ * Comparison helpers for two stat buffers.  Each argument is a pointer
+ * that is cast to (struct stat *); arguments are parenthesised so that
+ * compound expressions (e.g. a conditional expression) are cast as a
+ * whole.  Each macro evaluates to non-zero when the buffers differ.
+ *
+ * FILETYPE_DIFFERS   - file type bits (S_IFMT) of st_mode differ
+ * PERMISSION_DIFFERS - st_mode values differ (whole field is compared)
+ * OWNERSHIP_DIFFERS  - st_uid or st_gid differ
+ * SIZE_DIFFERS       - st_size values differ
+ */
+#define FILETYPE_DIFFERS(buf1,buf2) ((S_IFMT & ((struct stat *)(buf1))->st_mode) != (S_IFMT & ((struct stat *)(buf2))->st_mode))
+#define PERMISSION_DIFFERS(buf1,buf2) ((((struct stat *)(buf1))->st_mode) != (((struct stat *)(buf2))->st_mode))
+#define OWNERSHIP_DIFFERS(buf1,buf2) (((((struct stat *)(buf1))->st_uid) != (((struct stat *)(buf2))->st_uid)) || ((((struct stat *)(buf1))->st_gid) != (((struct stat *)(buf2))->st_gid)))
+#define SIZE_DIFFERS(buf1,buf2) ((((struct stat *)(buf1))->st_size) != (((struct stat *)(buf2))->st_size))
+
+
+/*
+ * Pending-changelog predicates on a lookup xattr dict.  Their
+ * definitions are not in this header; presumably each returns non-zero
+ * when the corresponding AFR pending xattr in `xattr' records
+ * outstanding operations -- TODO confirm against the definitions.
+ */
+int
+afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this);
+int
+afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this);
+int
+afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this);
+/* Entry (directory) self-heal phase; run by the metadata phase for directories. */
+int
+afr_self_heal_entry (call_frame_t *frame, xlator_t *this);
+/* Data self-heal phase; run by the metadata phase for regular files. */
+int
+afr_self_heal_data (call_frame_t *frame, xlator_t *this);
+/* Metadata self-heal phase (ownership, mode, times, xattrs). */
+int
+afr_self_heal_metadata (call_frame_t *frame, xlator_t *this);
+/*
+ * Self-heal entry point.  completion_cbk is presumably what the
+ * phases invoke as sh->completion_cbk when healing ends -- TODO
+ * confirm against the definition.
+ */
+int
+afr_self_heal (call_frame_t *frame, xlator_t *this,
+ int (*completion_cbk) (call_frame_t *, xlator_t *));
+
+#endif /* __AFR_SELF_HEAL_H__ */
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
new file mode 100644
index 000000000..3df9f07e5
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -0,0 +1,957 @@
+/*
+ Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include "dict.h"
+#include "byte-order.h"
+
+#include "afr.h"
+#include "afr-transaction.h"
+
+#include <signal.h>
+
+
+/*
+ * Set every one of the child_count slots of pending[] to 1, stored
+ * through hton32().
+ */
+static void
+__mark_all_pending (int32_t *pending, int child_count)
+{
+        int idx = 0;
+
+        while (idx < child_count) {
+                pending[idx] = hton32 (1);
+                idx++;
+        }
+}
+
+
+/*
+ * Clear the pending slot of a single child.  child_count is accepted
+ * for symmetry with the other helpers but is not used.
+ */
+static void
+__mark_child_dead (int32_t *pending, int child_count, int child)
+{
+        int32_t *slot = &pending[child];
+
+        *slot = 0;
+}
+
+
+/*
+ * Clear the pending slot of every child whose child_up[] flag is
+ * zero; slots of children that are up are left untouched.
+ */
+static void
+__mark_down_children (int32_t *pending, int child_count, unsigned char *child_up)
+{
+        int idx = 0;
+
+        while (idx < child_count) {
+                if (child_up[idx] == 0)
+                        pending[idx] = 0;
+                idx++;
+        }
+}
+
+
+/*
+ * Set every one of the child_count slots of pending[] to -1, stored
+ * through hton32().
+ */
+static void
+__mark_all_success (int32_t *pending, int child_count)
+{
+        int idx = 0;
+
+        while (idx < child_count) {
+                pending[idx] = hton32 (-1);
+                idx++;
+        }
+}
+
+
+/*
+ * Report whether this is the first write seen on fd.
+ *
+ * The fd context of this translator is used as the marker: if
+ * fd_ctx_get() fails, the context is set (value 0xaf1) and 1 is
+ * returned; otherwise 0 is returned.  The result of fd_ctx_set() is
+ * not checked.
+ */
+static int
+__is_first_write_on_fd (xlator_t *this, fd_t *fd)
+{
+        int lookup_ret = -1;
+
+        lookup_ret = fd_ctx_get (fd, this, NULL);
+        if (lookup_ret >= 0)
+                return 0;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "first writev() on fd=%p, writing changelog",
+                fd);
+
+        lookup_ret = fd_ctx_set (fd, this, 0xaf1);
+
+        return 1;
+}
+
+
+/*
+ * Return 1 if the changelog is enabled for the given transaction
+ * type, 0 otherwise.  Data, metadata and entry (including rename)
+ * transactions follow the matching *_change_log option in priv; flush
+ * transactions are always enabled.
+ */
+static int
+__changelog_enabled (afr_private_t *priv, afr_transaction_type type)
+{
+        int enabled = 0;
+
+        switch (type) {
+        case AFR_DATA_TRANSACTION:
+                enabled = priv->data_change_log ? 1 : 0;
+                break;
+
+        case AFR_METADATA_TRANSACTION:
+                enabled = priv->metadata_change_log ? 1 : 0;
+                break;
+
+        case AFR_ENTRY_TRANSACTION:
+        case AFR_ENTRY_RENAME_TRANSACTION:
+                enabled = priv->entry_change_log ? 1 : 0;
+                break;
+
+        case AFR_FLUSH_TRANSACTION:
+                enabled = 1;
+        }
+
+        return enabled;
+}
+
+
+/*
+ * Decide whether the changelog must be written before the operation.
+ *
+ * Returns 0 when the changelog is disabled for the transaction type.
+ * Otherwise: for write/ftruncate, 1 only when there is no fd or this
+ * is the first write on the fd; for flush, 0 (flush only does the
+ * post-op); for every other operation, 1.
+ */
+static int
+__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t *priv   = this->private;
+        afr_local_t   *local  = frame->local;
+        fd_t          *fd     = NULL;
+        int            needed = 0;
+
+        if (!__changelog_enabled (priv, local->transaction.type))
+                return 0;
+
+        switch (local->op) {
+
+        case GF_FOP_WRITE:
+        case GF_FOP_FTRUNCATE:
+                /*
+                  if it's a data transaction, we write the changelog
+                  only on the first write on an fd
+                */
+                fd = local->fd;
+                if (fd == NULL)
+                        needed = 1;
+                else if (__is_first_write_on_fd (this, fd))
+                        needed = 1;
+                break;
+
+        case GF_FOP_FLUSH:
+                /* only do post-op on flush() */
+                needed = 0;
+                break;
+
+        default:
+                needed = 1;
+        }
+
+        return needed;
+}
+
+
+/*
+ * Decide whether the changelog must be written after the operation:
+ * 1 when the changelog is enabled for the transaction type and the
+ * operation is neither write nor ftruncate, 0 otherwise.
+ */
+static int
+__changelog_needed_post_op (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t        *priv  = this->private;
+        afr_local_t          *local = frame->local;
+        afr_transaction_type  type  = local->transaction.type;
+
+        if (!__changelog_enabled (priv, type))
+                return 0;
+
+        if (local->op == GF_FOP_WRITE || local->op == GF_FOP_FTRUNCATE)
+                return 0;
+
+        return 1;
+}
+
+
+/*
+ * Return the configured lock-server count for a transaction type:
+ * the data count for data and flush transactions, the metadata count
+ * for metadata transactions, the entry count for entry and rename
+ * transactions, and 0 for any other value.
+ */
+static int
+afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)
+{
+        switch (type) {
+        case AFR_FLUSH_TRANSACTION:
+        case AFR_DATA_TRANSACTION:
+                return priv->data_lock_server_count;
+
+        case AFR_METADATA_TRANSACTION:
+                return priv->metadata_lock_server_count;
+
+        case AFR_ENTRY_TRANSACTION:
+        case AFR_ENTRY_RENAME_TRANSACTION:
+                return priv->entry_lock_server_count;
+        }
+
+        return 0;
+}
+
+
+/* {{{ unlock */
+
+/*
+ * Callback shared by all unlock requests sent from afr_unlock().
+ *
+ * The unlock result is not inspected.  local->call_count is
+ * decremented under the frame lock; when it reaches zero the
+ * transaction's done() handler is invoked.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local     = frame->local;
+        int          remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+                local->call_count--;
+                remaining = local->call_count;
+        }
+        UNLOCK (&frame->lock);
+
+        if (remaining != 0)
+                return 0;
+
+        local->transaction.done (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Release the locks held by the current transaction.
+ *
+ * One unlock is sent to every child recorded in
+ * local->transaction.locked_nodes (two per child for rename
+ * transactions, which hold a lock on both parents).  Data, metadata
+ * and flush transactions unlock the inode range
+ * [transaction.start, transaction.len] through finodelk/inodelk;
+ * entry transactions unlock the basename through fentrylk/entrylk.
+ * Replies are collected by afr_unlock_common_cbk(); if no node is
+ * locked, the transaction's done() handler is called directly.
+ *
+ * The flock structure is zero-initialised so that the fields not set
+ * explicitly below (l_whence, l_pid) are not sent with indeterminate
+ * values.
+ *
+ * Always returns 0.
+ */
+int
+afr_unlock (call_frame_t *frame, xlator_t *this)
+{
+        struct flock flock = {0, };
+
+        int i = 0;
+        int call_count = 0;
+
+        afr_local_t   *local = NULL;
+        afr_private_t *priv  = this->private;
+
+        local = frame->local;
+
+        call_count = afr_locked_nodes_count (local->transaction.locked_nodes,
+                                             priv->child_count);
+
+        if (call_count == 0) {
+                local->transaction.done (frame, this);
+                return 0;
+        }
+
+        if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)
+                call_count *= 2;
+
+        local->call_count = call_count;
+
+        for (i = 0; i < priv->child_count; i++) {
+                flock.l_start = local->transaction.start;
+                flock.l_len   = local->transaction.len;
+                flock.l_type  = F_UNLCK;
+
+                if (local->transaction.locked_nodes[i]) {
+                        switch (local->transaction.type) {
+                        case AFR_DATA_TRANSACTION:
+                        case AFR_METADATA_TRANSACTION:
+                        case AFR_FLUSH_TRANSACTION:
+
+                                if (local->fd) {
+                                        STACK_WIND (frame, afr_unlock_common_cbk,
+                                                    priv->children[i],
+                                                    priv->children[i]->fops->finodelk,
+                                                    local->fd, F_SETLK, &flock);
+                                } else {
+                                        STACK_WIND (frame, afr_unlock_common_cbk,
+                                                    priv->children[i],
+                                                    priv->children[i]->fops->inodelk,
+                                                    &local->loc, F_SETLK, &flock);
+                                }
+
+                                break;
+
+                        case AFR_ENTRY_RENAME_TRANSACTION:
+
+                                /* first unlock the new parent ... */
+                                STACK_WIND (frame, afr_unlock_common_cbk,
+                                            priv->children[i],
+                                            priv->children[i]->fops->entrylk,
+                                            &local->transaction.new_parent_loc,
+                                            local->transaction.new_basename,
+                                            ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+
+                                call_count--;
+
+                                /* fall through */
+
+                        case AFR_ENTRY_TRANSACTION:
+                                /* ... then the (old) parent */
+                                if (local->fd) {
+                                        STACK_WIND (frame, afr_unlock_common_cbk,
+                                                    priv->children[i],
+                                                    priv->children[i]->fops->fentrylk,
+                                                    local->fd,
+                                                    local->transaction.basename,
+                                                    ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+                                } else {
+                                        STACK_WIND (frame, afr_unlock_common_cbk,
+                                                    priv->children[i],
+                                                    priv->children[i]->fops->entrylk,
+                                                    &local->transaction.parent_loc,
+                                                    local->transaction.basename,
+                                                    ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+
+                                }
+                                break;
+                        }
+
+                        /* stop once every expected unlock has been wound */
+                        if (!--call_count)
+                                break;
+                }
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+
+/* {{{ pending */
+
+/*
+ * Callback for the post-op changelog xattrop calls.
+ *
+ * The xattrop result is not inspected.  local->call_count is
+ * decremented under the frame lock; after the last reply the
+ * transaction is completed directly through done() when no lock
+ * servers are configured for its type, otherwise the locks are
+ * released first through afr_unlock().
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, dict_t *xattr)
+{
+        afr_private_t *priv      = this->private;
+        afr_local_t   *local     = frame->local;
+        int            remaining = -1;
+
+        LOCK (&frame->lock);
+        {
+                local->call_count--;
+                remaining = local->call_count;
+        }
+        UNLOCK (&frame->lock);
+
+        if (remaining != 0)
+                return 0;
+
+        if (afr_lock_server_count (priv, local->transaction.type) != 0)
+                afr_unlock (frame, this);
+        else
+                local->transaction.done (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Send the post-operation changelog xattrop to every child that is marked
+ * up in local->child_up.
+ *
+ * local->pending_array is first passed through __mark_all_success() and
+ * __mark_down_children(), then stored in a fresh dict under the key
+ * local->transaction.pending and wound with GF_XATTROP_ADD_ARRAY. The fd
+ * variant (fxattrop) is used when local->fd is set, the path variant
+ * (xattrop) otherwise. A rename transaction issues two xattrops per child
+ * (new parent first, then the fall-through case), which is why call_count
+ * is doubled for it.
+ *
+ * When no child is up the function goes straight to afr_unlock().
+ * Replies are handled by afr_changelog_post_op_cbk(). Always returns 0.
+ */
+int
+afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t * priv = this->private;
+
+ int ret = 0;
+ int i = 0;
+ int call_count = 0;
+
+ afr_local_t * local = NULL;
+ /* NOTE(review): the result of get_new_dict () is not checked here */
+ dict_t * xattr = dict_ref (get_new_dict ());
+
+ local = frame->local;
+
+ __mark_all_success (local->pending_array, priv->child_count);
+ __mark_down_children (local->pending_array, priv->child_count, local->child_up);
+
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ /* rename touches two parent directories, so two xattrops per child */
+ if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ call_count *= 2;
+ }
+
+ local->call_count = call_count;
+
+ if (call_count == 0) {
+ /* no child is up */
+ dict_unref (xattr);
+ afr_unlock (frame, this);
+ return 0;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ ret = dict_set_static_bin (xattr, local->transaction.pending,
+ local->pending_array,
+ priv->child_count * sizeof (int32_t));
+ /* a failure to set the key is only logged; the xattrop is still wound */
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set pending entry");
+
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ case AFR_FLUSH_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ else
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->loc,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ }
+ break;
+
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ {
+ /* first of the two xattrops: the new parent directory */
+ STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr);
+
+ /* account for the wind above; the shared decrement below covers the second one */
+ call_count--;
+ }
+
+ /*
+ set it again because previous stack_wind
+ might have already returned (think of case
+ where subvolume is posix) and would have
+ used the dict as placeholder for return
+ value
+ */
+ ret = dict_set_static_bin (xattr, local->transaction.pending,
+ local->pending_array,
+ priv->child_count * sizeof (int32_t));
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set pending entry");
+
+ /* fall through */
+
+ case AFR_ENTRY_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ else
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ }
+ break;
+ }
+
+ /* stop as soon as every expected xattrop has been wound */
+ if (!--call_count)
+ break;
+ }
+ }
+
+ dict_unref (xattr);
+ return 0;
+}
+
+
+/*
+ * Reply handler for the xattrops wound by afr_changelog_pre_op().
+ * The cookie carries the index of the child that replied.
+ *
+ * A failed reply marks that child as down in local->child_up and records
+ * op_errno; ENOTSUP additionally sets local->op_ret to -1. When the last
+ * reply arrives the transaction either jumps to its resume handler (if an
+ * ENOTSUP failure was recorded) or proceeds to the actual fop.
+ * Always returns 0.
+ */
+int32_t
+afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, dict_t *xattr)
+{
+        afr_private_t *priv      = this->private;
+        afr_local_t   *local     = frame->local;
+        int            child     = (long) cookie;
+        int            remaining = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == -1) {
+                        local->child_up[child] = 0;
+
+                        if (op_errno == ENOTSUP) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "xattrop not supported by %s",
+                                        priv->children[child]->name);
+                                local->op_ret = -1;
+                        } else if (!child_went_down (op_ret, op_errno)) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "xattrop failed on child %s: %s",
+                                        priv->children[child]->name,
+                                        strerror (op_errno));
+                        }
+
+                        local->op_errno = op_errno;
+                }
+
+                local->call_count -= 1;
+                remaining = local->call_count;
+        }
+        UNLOCK (&frame->lock);
+
+        /* wait for the remaining children */
+        if (remaining != 0)
+                return 0;
+
+        if ((local->op_ret == -1) && (local->op_errno == ENOTSUP)) {
+                local->transaction.resume (frame, this);
+        } else {
+                local->transaction.fop (frame, this);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Send the pre-operation changelog xattrop to every child that is marked
+ * up in local->child_up.
+ *
+ * local->pending_array is prepared with __mark_all_pending(), stored in a
+ * fresh dict under the key local->transaction.pending and wound with
+ * GF_XATTROP_ADD_ARRAY; the child index travels as the cookie. The fd
+ * variant (fxattrop) is used when local->fd is set, the path variant
+ * (xattrop) otherwise. A rename transaction issues two xattrops per child
+ * (new parent first, then the fall-through case), hence the doubled
+ * call_count.
+ *
+ * When no child is up the function goes straight to afr_unlock().
+ * Replies are handled by afr_changelog_pre_op_cbk(). Always returns 0.
+ */
+int
+afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t * priv = this->private;
+
+ int i = 0;
+ int ret = 0;
+ int call_count = 0;
+ dict_t *xattr = NULL;
+
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+ /* NOTE(review): the result of get_new_dict () is not checked here */
+ xattr = get_new_dict ();
+ dict_ref (xattr);
+
+ call_count = afr_up_children_count (priv->child_count,
+ local->child_up);
+
+ /* rename touches two parent directories, so two xattrops per child */
+ if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ call_count *= 2;
+ }
+
+ if (call_count == 0) {
+ /* no child is up */
+ dict_unref (xattr);
+ afr_unlock (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ __mark_all_pending (local->pending_array, priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ ret = dict_set_static_bin (xattr,
+ local->transaction.pending,
+ local->pending_array,
+ (priv->child_count *
+ sizeof (int32_t)));
+ /* a failure to set the key is only logged; the xattrop is still wound */
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set pending entry");
+
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ case AFR_FLUSH_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ else
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &(local->loc),
+ GF_XATTROP_ADD_ARRAY, xattr);
+ }
+ break;
+
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ {
+ /* first of the two xattrops: the new parent directory */
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr);
+
+ /* account for the wind above; the shared decrement below covers the second one */
+ call_count--;
+ }
+
+
+ /*
+ set it again because previous stack_wind
+ might have already returned (think of case
+ where subvolume is posix) and would have
+ used the dict as placeholder for return
+ value
+ */
+
+ ret = dict_set_static_bin (xattr, local->transaction.pending,
+ local->pending_array,
+ priv->child_count * sizeof (int32_t));
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set pending entry");
+
+ /* fall through */
+
+ case AFR_ENTRY_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ else
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr);
+ }
+
+ break;
+ }
+
+ /* stop as soon as every expected xattrop has been wound */
+ if (!--call_count)
+ break;
+ }
+ }
+
+ dict_unref (xattr);
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ lock */
+
+static
+int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index);
+
+/*
+ * Reply handler for the lock requests wound by afr_lock_rec(). The cookie
+ * carries the index of the child that was asked for the lock.
+ *
+ * A rename transaction winds two locks per child, so for that type the
+ * handler waits (via local->call_count) until both replies are in. A
+ * failed reply marks the child as down and records op_errno; ENOSYS
+ * additionally sets local->op_ret, which makes the handler give up and
+ * call afr_unlock(). Otherwise the child is recorded in
+ * transaction.locked_nodes and locking continues with the next child.
+ *
+ * NOTE(review): the child is recorded as locked and lock_count is bumped
+ * even when this reply carried a non-ENOSYS failure -- confirm that this
+ * is intended.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+              int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local   = frame->local;
+        int          child   = (long) cookie;
+        int          pending = 0;
+
+        LOCK (&frame->lock);
+        {
+                if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+                        /* wait for the other lock to return */
+                        local->call_count -= 1;
+                        pending = local->call_count;
+                }
+
+                if (op_ret == -1) {
+                        if (op_errno == ENOSYS) {
+                                /* return ENOTSUP */
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "subvolume does not support locking. "
+                                        "please load features/posix-locks xlator on server");
+                                local->op_ret = op_ret;
+                        }
+
+                        local->child_up[child] = 0;
+                        local->op_errno = op_errno;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        if (pending != 0)
+                return 0;
+
+        if ((local->op_ret == -1) && (local->op_errno == ENOSYS)) {
+                /* locking is unsupported: release whatever is held */
+                afr_unlock (frame, this);
+                return 0;
+        }
+
+        local->transaction.locked_nodes[child] = 1;
+        local->transaction.lock_count++;
+        afr_lock_rec (frame, this, child + 1);
+
+        return 0;
+}
+
+
+/*
+ * Pick the "lower" of two (parent loc, basename) pairs: the parents are
+ * compared by path with strcmp(), and the basenames break a tie. Returns
+ * l1 when the first pair sorts before or equal to the second, l2
+ * otherwise.
+ */
+static loc_t *
+lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2)
+{
+        int order = strcmp (l1->path, l2->path);
+
+        if (order == 0)
+                order = strcmp (b1, b2);
+
+        return (order > 0) ? l2 : l1;
+}
+
+
+/*
+ * Acquire the transaction lock on one child and let afr_lock_cbk() call
+ * back in here for the next one.
+ *
+ * Starting at child_index, children that are not up are skipped. Then:
+ *   - if the end of the child list is reached and no lock has been
+ *     counted, the transaction fails with EAGAIN through its done handler;
+ *   - if the end of the list is reached, or lock_count equals
+ *     afr_lock_server_count() for this transaction type, locking is
+ *     complete and the transaction moves on to the changelog pre-op (when
+ *     __changelog_needed_pre_op() says so) or straight to the fop;
+ *   - otherwise a lock request matching the transaction type is wound to
+ *     the child, with the child index as cookie.
+ *
+ * A rename transaction winds two entry locks per child; lower_path()
+ * decides which (parent, basename) pair is requested first.
+ *
+ * Always returns 0.
+ */
+static
+int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index)
+{
+        afr_local_t   *local = NULL;
+        afr_private_t *priv  = NULL;
+
+        /*
+         * Zero-filled so that the fields not assigned below (l_whence,
+         * l_pid) have a defined value when the struct is handed to the
+         * child's inodelk/finodelk.
+         */
+        struct flock flock = {0, };
+
+        loc_t *lower  = NULL;
+        loc_t *higher = NULL;
+
+        const char *lower_name  = NULL;
+        const char *higher_name = NULL;
+
+        local = frame->local;
+        priv  = this->private;
+
+        flock.l_start = local->transaction.start;
+        flock.l_len   = local->transaction.len;
+        flock.l_type  = F_WRLCK;
+
+        /* skip over children that are down */
+        while ((child_index < priv->child_count)
+               && !local->child_up[child_index])
+                child_index++;
+
+        if ((child_index == priv->child_count) &&
+            local->transaction.lock_count == 0) {
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unable to lock on even one child");
+
+                local->op_ret   = -1;
+                local->op_errno = EAGAIN;
+
+                local->transaction.done (frame, this);
+
+                return 0;
+        }
+
+        if ((child_index == priv->child_count)
+            || (local->transaction.lock_count ==
+                afr_lock_server_count (priv, local->transaction.type))) {
+
+                /* we're done locking */
+
+                if (__changelog_needed_pre_op (frame, this)) {
+                        afr_changelog_pre_op (frame, this);
+                } else {
+                        local->transaction.fop (frame, this);
+                }
+
+                return 0;
+        }
+
+        switch (local->transaction.type) {
+        case AFR_DATA_TRANSACTION:
+        case AFR_METADATA_TRANSACTION:
+        case AFR_FLUSH_TRANSACTION:
+
+                if (local->fd) {
+                        STACK_WIND_COOKIE (frame, afr_lock_cbk,
+                                           (void *) (long) child_index,
+                                           priv->children[child_index],
+                                           priv->children[child_index]->fops->finodelk,
+                                           local->fd, F_SETLKW, &flock);
+
+                } else {
+                        STACK_WIND_COOKIE (frame, afr_lock_cbk,
+                                           (void *) (long) child_index,
+                                           priv->children[child_index],
+                                           priv->children[child_index]->fops->inodelk,
+                                           &local->loc, F_SETLKW, &flock);
+                }
+
+                break;
+
+        case AFR_ENTRY_RENAME_TRANSACTION:
+        {
+                /* two entry locks are wound; afr_lock_cbk waits for both */
+                local->call_count = 2;
+
+                lower = lower_path (&local->transaction.parent_loc,
+                                    local->transaction.basename,
+                                    &local->transaction.new_parent_loc,
+                                    local->transaction.new_basename);
+
+                lower_name = (lower == &local->transaction.parent_loc ?
+                              local->transaction.basename :
+                              local->transaction.new_basename);
+
+                higher = (lower == &local->transaction.parent_loc ?
+                          &local->transaction.new_parent_loc :
+                          &local->transaction.parent_loc);
+
+                higher_name = (higher == &local->transaction.parent_loc ?
+                               local->transaction.basename :
+                               local->transaction.new_basename);
+
+
+                /* TODO: these locks should be blocking */
+
+                STACK_WIND_COOKIE (frame, afr_lock_cbk,
+                                   (void *) (long) child_index,
+                                   priv->children[child_index],
+                                   priv->children[child_index]->fops->entrylk,
+                                   lower, lower_name,
+                                   ENTRYLK_LOCK, ENTRYLK_WRLCK);
+
+                STACK_WIND_COOKIE (frame, afr_lock_cbk,
+                                   (void *) (long) child_index,
+                                   priv->children[child_index],
+                                   priv->children[child_index]->fops->entrylk,
+                                   higher, higher_name,
+                                   ENTRYLK_LOCK, ENTRYLK_WRLCK);
+
+                break;
+        }
+
+        case AFR_ENTRY_TRANSACTION:
+                if (local->fd) {
+                        STACK_WIND_COOKIE (frame, afr_lock_cbk,
+                                           (void *) (long) child_index,
+                                           priv->children[child_index],
+                                           priv->children[child_index]->fops->fentrylk,
+                                           local->fd,
+                                           local->transaction.basename,
+                                           ENTRYLK_LOCK, ENTRYLK_WRLCK);
+                } else {
+                        STACK_WIND_COOKIE (frame, afr_lock_cbk,
+                                           (void *) (long) child_index,
+                                           priv->children[child_index],
+                                           priv->children[child_index]->fops->entrylk,
+                                           &local->transaction.parent_loc,
+                                           local->transaction.basename,
+                                           ENTRYLK_LOCK, ENTRYLK_WRLCK);
+                }
+
+                break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Entry point for taking the transaction locks: starts the per-child
+ * locking sequence of afr_lock_rec() at child index 0 and returns its
+ * result.
+ */
+int32_t afr_lock (call_frame_t *frame, xlator_t *this)
+{
+ return afr_lock_rec (frame, this, 0);
+}
+
+
+/* }}} */
+
+/*
+ * Continue a transaction after its fop phase.
+ *
+ * If __changelog_needed_post_op() asks for it, the changelog post-op is
+ * started. Otherwise the transaction is finished right away: through the
+ * done handler when afr_lock_server_count() reports zero for this
+ * transaction type, through afr_unlock() when it does not.
+ * Always returns 0.
+ */
+int32_t
+afr_transaction_resume (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local = frame->local;
+        afr_private_t *priv  = this->private;
+
+        if (__changelog_needed_post_op (frame, this)) {
+                afr_changelog_post_op (frame, this);
+                return 0;
+        }
+
+        if (afr_lock_server_count (priv, local->transaction.type) != 0) {
+                afr_unlock (frame, this);
+        } else {
+                local->transaction.done (frame, this);
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_transaction_child_died - inform the transaction that a child died
+ * during an fop
+ *
+ * Forwards the child index to __mark_child_dead() together with the
+ * frame's pending array and the configured child count.
+ */
+
+void
+afr_transaction_child_died (call_frame_t *frame, xlator_t *this, int child_index)
+{
+        afr_private_t *priv  = this->private;
+        afr_local_t   *local = frame->local;
+
+        __mark_child_dead (local->pending_array,
+                           priv->child_count,
+                           child_index);
+}
+
+
+/*
+ * Start a transaction of the given type on the frame.
+ *
+ * After afr_transaction_local_init() the resume handler and the type are
+ * stored in local->transaction. If afr_lock_server_count() reports a
+ * non-zero count for the type, the locking phase is started; otherwise
+ * the transaction skips it and goes to the changelog pre-op (when
+ * __changelog_needed_pre_op() says so) or directly to the fop.
+ * Always returns 0.
+ */
+int32_t
+afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
+{
+        afr_local_t   *local = frame->local;
+        afr_private_t *priv  = this->private;
+
+        afr_transaction_local_init (local, priv);
+
+        local->transaction.resume = afr_transaction_resume;
+        local->transaction.type   = type;
+
+        if (afr_lock_server_count (priv, local->transaction.type) != 0) {
+                afr_lock (frame, this);
+                return 0;
+        }
+
+        /* no lock servers for this type: skip the locking phase */
+        if (__changelog_needed_pre_op (frame, this)) {
+                afr_changelog_pre_op (frame, this);
+        } else {
+                local->transaction.fop (frame, this);
+        }
+
+        return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
new file mode 100644
index 000000000..49cdd219f
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -0,0 +1,36 @@
+/*
+ Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __TRANSACTION_H__
+#define __TRANSACTION_H__
+
+/*
+ * Extended attribute names for the three changelog kinds. Transactions
+ * store one of them in local->transaction.pending and use it as the
+ * xattrop key; afr_lookup requests them from the children.
+ */
+#define AFR_METADATA_PENDING "trusted.glusterfs.afr.metadata-pending"
+
+#define AFR_DATA_PENDING "trusted.glusterfs.afr.data-pending"
+
+#define AFR_ENTRY_PENDING "trusted.glusterfs.afr.entry-pending"
+
+/* Inform a running transaction that child number child_index died during an fop. */
+void
+afr_transaction_child_died (call_frame_t *frame, xlator_t *this,
+ int child_index);
+
+/* Start a transaction of the given type on the frame. */
+int32_t
+afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type);
+
+#endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
new file mode 100644
index 000000000..e4c1a8479
--- /dev/null
+++ b/xlators/cluster/afr/src/afr.c
@@ -0,0 +1,2338 @@
+/*
+ Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+#include "byte-order.h"
+
+#include "afr-inode-read.h"
+#include "afr-inode-write.h"
+#include "afr-dir-read.h"
+#include "afr-dir-write.h"
+#include "afr-transaction.h"
+
+#include "afr-self-heal.h"
+
+
+/**
+ * afr_local_sh_cleanup - release the self-heal state kept in frame->local
+ *
+ * Frees the buffers and per-child arrays of local->self_heal, drops the
+ * references on the per-child xattr dicts and on the healing fd, and wipes
+ * the parent loc. The per-child arrays are walked using priv->child_count.
+ */
+
+void
+afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)
+{
+        afr_self_heal_t *sh    = &local->self_heal;
+        afr_private_t   *priv  = this->private;
+        int              child = 0;
+
+        if (sh->buf) {
+                FREE (sh->buf);
+        }
+
+        if (sh->xattr) {
+                for (child = 0; child < priv->child_count; child++) {
+                        if (!sh->xattr[child])
+                                continue;
+
+                        dict_unref (sh->xattr[child]);
+                        sh->xattr[child] = NULL;
+                }
+                FREE (sh->xattr);
+        }
+
+        if (sh->child_errno) {
+                FREE (sh->child_errno);
+        }
+
+        /* both matrices hold one separately allocated row per child */
+        if (sh->pending_matrix) {
+                for (child = 0; child < priv->child_count; child++) {
+                        FREE (sh->pending_matrix[child]);
+                }
+                FREE (sh->pending_matrix);
+        }
+
+        if (sh->delta_matrix) {
+                for (child = 0; child < priv->child_count; child++) {
+                        FREE (sh->delta_matrix[child]);
+                }
+                FREE (sh->delta_matrix);
+        }
+
+        if (sh->sources) {
+                FREE (sh->sources);
+        }
+
+        if (sh->success) {
+                FREE (sh->success);
+        }
+
+        if (sh->healing_fd) {
+                fd_unref (sh->healing_fd);
+                sh->healing_fd = NULL;
+        }
+
+        loc_wipe (&sh->parent_loc);
+}
+
+
+/*
+ * Release everything owned by an afr_local_t (but not the structure
+ * itself). A NULL local is accepted and ignored.
+ *
+ * The self-heal state goes first (afr_local_sh_cleanup), then the common
+ * and transaction members, and finally the per-fop members under
+ * local->cont.
+ */
+void
+afr_local_cleanup (afr_local_t *local, xlator_t *this)
+{
+        if (local == NULL)
+                return;
+
+        afr_local_sh_cleanup (local, this);
+
+        /* common members */
+        FREE (local->child_errno);
+        FREE (local->pending_array);
+
+        loc_wipe (&local->loc);
+        loc_wipe (&local->newloc);
+
+        /* transaction members */
+        FREE (local->transaction.locked_nodes);
+        FREE (local->transaction.child_errno);
+
+        FREE (local->transaction.basename);
+        FREE (local->transaction.new_basename);
+
+        loc_wipe (&local->transaction.parent_loc);
+        loc_wipe (&local->transaction.new_parent_loc);
+
+        if (local->fd) {
+                fd_unref (local->fd);
+        }
+
+        if (local->xattr_req) {
+                dict_unref (local->xattr_req);
+        }
+
+        FREE (local->child_up);
+
+        /* lookup */
+        if (local->cont.lookup.xattr) {
+                dict_unref (local->cont.lookup.xattr);
+        }
+
+        /* getxattr */
+        if (local->cont.getxattr.name) {
+                FREE (local->cont.getxattr.name);
+        }
+
+        /* lk */
+        if (local->cont.lk.locked_nodes) {
+                FREE (local->cont.lk.locked_nodes);
+        }
+
+        /* checksum */
+        if (local->cont.checksum.file_checksum) {
+                FREE (local->cont.checksum.file_checksum);
+        }
+        if (local->cont.checksum.dir_checksum) {
+                FREE (local->cont.checksum.dir_checksum);
+        }
+
+        /* create */
+        if (local->cont.create.fd) {
+                fd_unref (local->cont.create.fd);
+        }
+
+        /* writev */
+        FREE (local->cont.writev.vector);
+
+        /* setxattr */
+        if (local->cont.setxattr.dict) {
+                dict_unref (local->cont.setxattr.dict);
+        }
+
+        /* removexattr */
+        FREE (local->cont.removexattr.name);
+
+        /* symlink */
+        FREE (local->cont.symlink.linkpath);
+}
+
+
+/*
+ * Account for one returned reply: decrement local->call_count under
+ * frame->lock and return the new value. A return of 0 therefore means
+ * the caller handled the last outstanding reply.
+ */
+int
+afr_frame_return (call_frame_t *frame)
+{
+        afr_local_t *local     = frame->local;
+        int          remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+                local->call_count -= 1;
+                remaining = local->call_count;
+        }
+        UNLOCK (&frame->lock);
+
+        return remaining;
+}
+
+/**
+ * first_up_child - return the index of the first child that is up
+ *
+ * priv->child_up is scanned under priv->lock. Returns -1 when no child
+ * is marked up.
+ */
+
+int
+afr_first_up_child (afr_private_t *priv)
+{
+        int found = -1;
+        int idx   = 0;
+
+        LOCK (&priv->lock);
+        {
+                while (idx < priv->child_count) {
+                        if (priv->child_up[idx]) {
+                                found = idx;
+                                break;
+                        }
+                        idx++;
+                }
+        }
+        UNLOCK (&priv->lock);
+
+        return found;
+}
+
+
+/**
+ * up_children_count - return the number of children that are up
+ *
+ * Counts the non-zero entries among the first child_count bytes of
+ * child_up.
+ */
+
+int
+afr_up_children_count (int child_count, unsigned char *child_up)
+{
+        int up  = 0;
+        int idx = 0;
+
+        for (idx = 0; idx < child_count; idx++) {
+                up += child_up[idx] ? 1 : 0;
+        }
+
+        return up;
+}
+
+
+/*
+ * Return how many of the first child_count entries of locked_nodes are
+ * non-zero.
+ */
+int
+afr_locked_nodes_count (unsigned char *locked_nodes, int child_count)
+{
+        int locked = 0;
+        int idx    = 0;
+
+        while (idx < child_count) {
+                if (locked_nodes[idx] != 0)
+                        locked++;
+                idx++;
+        }
+
+        return locked;
+}
+
+
+/*
+ * Map a child's inode number into this translator's inode number space:
+ * ino * child_count + child_index. The value (uint64_t) -1 is passed
+ * through unchanged.
+ */
+ino64_t
+afr_itransform (ino64_t ino, int child_count, int child_index)
+{
+        const ino64_t invalid = ((uint64_t) -1);
+
+        if (ino == invalid)
+                return invalid;
+
+        return (ino * child_count) + child_index;
+}
+
+
+/*
+ * Recover the child index encoded in an inode number produced by
+ * afr_itransform(): the remainder of ino divided by child_count.
+ */
+int
+afr_deitransform_orig (ino64_t ino, int child_count)
+{
+        return (int) (ino % child_count);
+}
+
+
+/*
+ * Stub variant of afr_deitransform_orig(): ignores both arguments and
+ * always reports child index 0.
+ */
+int
+afr_deitransform (ino64_t ino, int child_count)
+{
+ return 0;
+}
+
+
+/*
+ * Completion handler passed to afr_self_heal() by the lookup path.
+ *
+ * When the lookup saw mismatching file types (local->govinda_gOvinda) a
+ * value is stored in the inode context, and a failure to store it turns
+ * the lookup into an error. Otherwise any existing inode context value is
+ * removed. The lookup result is then unwound to the caller.
+ * Always returns 0.
+ */
+int
+afr_self_heal_cbk (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+        int          ret   = -1;
+
+        if (!local->govinda_gOvinda) {
+                inode_ctx_del (local->cont.lookup.inode, this, NULL);
+        } else {
+                ret = inode_ctx_put (local->cont.lookup.inode, this, 1);
+
+                if (ret < 0) {
+                        local->op_ret   = -1;
+                        local->op_errno = -ret;
+                }
+        }
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                          local->cont.lookup.inode,
+                          &local->cont.lookup.buf,
+                          local->cont.lookup.xattr);
+
+        return 0;
+}
+
+
+/*
+ * Per-child reply handler for afr_lookup(). The cookie carries the index
+ * of the child that replied.
+ *
+ * Under frame->lock:
+ *   - a failed reply counts ENOENT answers and records op_errno (unless it
+ *     is ENOTCONN);
+ *   - a successful reply checks the metadata/entry/data pending flags of
+ *     the returned xattr, adds the reported open fd count, and then either
+ *     stores inode, xattr and stat (first success) or compares the stat
+ *     against the stored one to decide which kinds of self-heal are
+ *     needed. The stored stat is replaced when the replying child has a
+ *     lower index than the child encoded in the stored inode number.
+ *
+ * Once the last reply is in (afr_frame_return () == 0) the handler either
+ * starts afr_self_heal () -- only when some self-heal is needed and no
+ * open fd was reported -- or unwinds the lookup result.
+ * Always returns 0.
+ */
+int
+afr_lookup_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ struct stat * lookup_buf = NULL;
+ int call_count = -1;
+ int child_index = -1;
+ int prev_child_index = -1;
+ uint32_t open_fd_count = 0;
+ int ret = 0;
+
+ child_index = (long) cookie;
+ priv = this->private;
+
+ LOCK (&frame->lock);
+ {
+ local = frame->local;
+
+ lookup_buf = &local->cont.lookup.buf;
+
+ if (op_ret == -1) {
+ if (op_errno == ENOENT)
+ local->enoent_count++;
+
+ /* ENOTCONN is the default set by afr_lookup; only other errors replace it */
+ if (op_errno != ENOTCONN)
+ local->op_errno = op_errno;
+
+ goto unlock;
+ }
+
+ if (afr_sh_has_metadata_pending (xattr, child_index, this))
+ local->need_metadata_self_heal = 1;
+
+ if (afr_sh_has_entry_pending (xattr, child_index, this))
+ local->need_entry_self_heal = 1;
+
+ if (afr_sh_has_data_pending (xattr, child_index, this))
+ local->need_data_self_heal = 1;
+
+ /* the return value is not checked; open_fd_count starts at 0 */
+ ret = dict_get_uint32 (xattr, GLUSTERFS_OPEN_FD_COUNT,
+ &open_fd_count);
+ local->open_fd_count += open_fd_count;
+
+ /* in case of revalidate, we need to send stat of the
+ * child whose stat was sent during the first lookup
+ * (so that time stamp does not vary with revalidate).
+ * in case it is down, stat of the first success will
+ * be replied */
+
+ /* inode number should be preserved across revalidates */
+
+ if (local->success_count == 0) {
+ local->op_ret = op_ret;
+
+ local->cont.lookup.inode = inode;
+ local->cont.lookup.xattr = dict_ref (xattr);
+
+ *lookup_buf = *buf;
+ lookup_buf->st_ino = afr_itransform (buf->st_ino,
+ priv->child_count,
+ child_index);
+ } else {
+ if (FILETYPE_DIFFERS (buf, lookup_buf)) {
+ /* mismatching filetypes with same name
+ -- Govinda !! GOvinda !!!
+ */
+ local->govinda_gOvinda = 1;
+ }
+
+ if (PERMISSION_DIFFERS (buf, lookup_buf)) {
+ /* mismatching permissions */
+ local->need_metadata_self_heal = 1;
+ }
+
+ if (OWNERSHIP_DIFFERS (buf, lookup_buf)) {
+ /* mismatching ownership */
+ local->need_metadata_self_heal = 1;
+ }
+
+ if (SIZE_DIFFERS (buf, lookup_buf)
+ && S_ISREG (buf->st_mode)) {
+ local->need_data_self_heal = 1;
+ }
+
+ /* prefer the stat of the lowest-indexed child that answered */
+ prev_child_index = afr_deitransform_orig (lookup_buf->st_ino,
+ priv->child_count);
+ if (child_index < prev_child_index) {
+ *lookup_buf = *buf;
+ lookup_buf->st_ino = afr_itransform (buf->st_ino,
+ priv->child_count,
+ child_index);
+ }
+ }
+
+ local->success_count++;
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ if (local->op_ret == 0) {
+ /* KLUDGE: assuming DHT will not itransform in
+ revalidate */
+ if (local->cont.lookup.inode->ino)
+ lookup_buf->st_ino =
+ local->cont.lookup.inode->ino;
+ }
+
+ /* present on some children and missing on others: heal everything */
+ if (local->success_count && local->enoent_count) {
+ local->need_metadata_self_heal = 1;
+ local->need_data_self_heal = 1;
+ local->need_entry_self_heal = 1;
+ }
+
+ if (local->success_count) {
+ /* check for govinda_gOvinda case in previous lookup */
+ if (!inode_ctx_get (local->cont.lookup.inode,
+ this, NULL))
+ local->need_data_self_heal = 1;
+ }
+
+ /* self-heal is attempted only when no open fd was reported */
+ if ((local->need_metadata_self_heal
+ || local->need_data_self_heal
+ || local->need_entry_self_heal)
+ && (!local->open_fd_count)) {
+
+ if (!local->cont.lookup.inode->st_mode) {
+ /* fix for RT #602 */
+ local->cont.lookup.inode->st_mode =
+ lookup_buf->st_mode;
+ }
+
+ afr_self_heal (frame, this, afr_self_heal_cbk);
+ } else {
+ AFR_STACK_UNWIND (frame, local->op_ret,
+ local->op_errno,
+ local->cont.lookup.inode,
+ &local->cont.lookup.buf,
+ local->cont.lookup.xattr);
+ }
+ }
+
+ return 0;
+}
+
+
+/*
+ * Look up loc on every child (including children currently marked down)
+ * and let afr_lookup_cbk() merge the replies.
+ *
+ * The request dict sent to the children asks for the pending changelog
+ * xattrs of every self-heal kind that is enabled in priv, plus the open
+ * fd count. If the caller supplied no xattr_req, a private dict is
+ * created for this purpose; otherwise the caller's dict is referenced and
+ * extended.
+ *
+ * On allocation failure the lookup is unwound with ENOMEM.
+ * Always returns 0.
+ */
+int
+afr_lookup (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, dict_t *xattr_req)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+        int            ret   = -1;
+        int            i     = 0;
+        /* NOTE(review): not used directly below; left in place in case
+           ALLOC_OR_GOTO refers to it -- confirm against its definition */
+        int32_t        op_errno = 0;
+
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        local->op_ret = -1;
+
+        frame->local = local;
+
+        loc_copy (&local->loc, loc);
+
+        local->reval_child_index = 0;
+
+        local->call_count = priv->child_count;
+
+        local->child_up = memdup (priv->child_up, priv->child_count);
+        if (local->child_up == NULL) {
+                /* ret is still -1: unwind with ENOMEM below */
+                goto out;
+        }
+
+        local->child_count = afr_up_children_count (priv->child_count,
+                                                    local->child_up);
+
+        /* By default assume ENOTCONN. On success it will be set to 0. */
+        local->op_errno = ENOTCONN;
+
+        /* always work on a valid dict: the open fd count is requested
+           even when no self-heal option is enabled */
+        if (xattr_req == NULL)
+                local->xattr_req = dict_new ();
+        else
+                local->xattr_req = dict_ref (xattr_req);
+
+        if (local->xattr_req == NULL) {
+                /* ret is still -1: unwind with ENOMEM below */
+                goto out;
+        }
+
+        /* the dict_set results are best-effort, as before: a key that
+           cannot be set only means that piece of information is missing
+           from the replies */
+        if (priv->metadata_self_heal) {
+                ret = dict_set_uint64 (local->xattr_req, AFR_METADATA_PENDING,
+                                       priv->child_count * sizeof(int32_t));
+        }
+
+        if (priv->data_self_heal) {
+                ret = dict_set_uint64 (local->xattr_req, AFR_DATA_PENDING,
+                                       priv->child_count * sizeof(int32_t));
+        }
+
+        if (priv->entry_self_heal) {
+                ret = dict_set_uint64 (local->xattr_req, AFR_ENTRY_PENDING,
+                                       priv->child_count * sizeof(int32_t));
+        }
+
+        ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 0);
+
+        for (i = 0; i < priv->child_count; i++) {
+                STACK_WIND_COOKIE (frame, afr_lookup_cbk, (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->lookup,
+                                   loc, local->xattr_req);
+        }
+
+        ret = 0;
+out:
+        if (ret == -1)
+                AFR_STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL, NULL);
+
+        return 0;
+}
+
+
+/* {{{ open */
+
+/*
+ * Reply handler for the ftruncate wound by afr_open_cbk() for O_TRUNC
+ * opens. The ftruncate result itself is not inspected: the open is
+ * unwound with the op_ret/op_errno collected from the open replies and
+ * with local->fd. Always returns 0.
+ */
+int
+afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_local_t * local = frame->local;
+
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ local->fd);
+ return 0;
+}
+
+
+/*
+ * Per-child reply handler for afr_open().
+ *
+ * A failed reply records op_errno, a successful one records op_ret, so
+ * the open succeeds if at least one child opened the file. After the last
+ * reply, an open that was requested with O_TRUNC and succeeded is
+ * followed by an ftruncate to length 0 wound to this translator itself
+ * (afr_open stripped O_TRUNC from the flags sent to the children);
+ * otherwise the open is unwound directly.
+ *
+ * The ftruncate is issued on local->fd, the fd referenced by afr_open(),
+ * and not on this callback's fd argument: the argument belongs to
+ * whichever child happened to reply last, and that reply may have been a
+ * failure.
+ *
+ * Always returns 0.
+ */
+int
+afr_open_cbk (call_frame_t *frame, void *cookie,
+              xlator_t *this, int32_t op_ret, int32_t op_errno,
+              fd_t *fd)
+{
+        afr_local_t *local = NULL;
+
+        int call_count = -1;
+
+        local = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == -1) {
+                        local->op_errno = op_errno;
+                }
+
+                if (op_ret >= 0) {
+                        local->op_ret = op_ret;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0) {
+                if ((local->cont.open.flags & O_TRUNC)
+                    && (local->op_ret >= 0)) {
+                        STACK_WIND (frame, afr_open_ftruncate_cbk,
+                                    this, this->fops->ftruncate,
+                                    local->fd, 0);
+                } else {
+                        AFR_STACK_UNWIND (frame, local->op_ret,
+                                          local->op_errno, local->fd);
+                }
+        }
+
+        return 0;
+}
+
+
+/*
+ * Open loc on every child that is up.
+ *
+ * If a value is present in the inode context for this translator the
+ * open is refused with EIO. O_TRUNC is stripped from the flags sent to
+ * the children; the original flags are kept in local->cont.open.flags
+ * for afr_open_cbk(). Failures before anything is wound are unwound from
+ * here. Always returns 0.
+ */
+int
+afr_open (call_frame_t *frame, xlator_t *this,
+          loc_t *loc, int32_t flags, fd_t *fd)
+{
+        afr_local_t   *local       = NULL;
+        afr_private_t *priv        = NULL;
+
+        int32_t        op_ret      = -1;
+        int32_t        op_errno    = 0;
+        int32_t        pending     = 0;
+        int32_t        child_flags = flags & (~O_TRUNC);
+
+        int            ret         = -1;
+        int            child       = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        VALIDATE_OR_GOTO (loc, out);
+
+        priv = this->private;
+
+        ret = inode_ctx_get (loc->inode, this, NULL);
+        if (ret == 0) {
+                /* if ctx is set it means self-heal failed */
+
+                gf_log (this->name, GF_LOG_WARNING,
+                        "returning EIO, file has to be manually corrected "
+                        "in backend");
+                op_errno = EIO;
+                goto out;
+        }
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        frame->local = local;
+        pending = local->call_count;
+
+        local->cont.open.flags = flags;
+        local->fd = fd_ref (fd);
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->open,
+                                   loc, child_flags, fd);
+
+                /* leave the loop right after the last expected wind */
+                pending--;
+                if (pending == 0)
+                        break;
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, fd);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ flush */
+
+/*
+ * Per-child reply handler for the flushes wound by afr_flush_wind().
+ *
+ * A successful reply sets local->op_ret to 0; op_errno of every reply is
+ * stored, so the last reply wins. When the last reply is in, the
+ * transaction is resumed. Always returns 0.
+ */
+int
+afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local     = frame->local;
+        int          remaining = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == 0) {
+                        local->op_ret = 0;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        local->transaction.resume (frame, this);
+
+        return 0;
+}
+
+
+/*
+ * Fop phase of the flush transaction: wind flush on local->fd to every
+ * child that is up. If no child is up the transaction is resumed right
+ * away. Replies are handled by afr_flush_wind_cbk(). Always returns 0.
+ */
+int
+afr_flush_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+
+        int            child   = 0;
+        int            pending = -1;
+
+        pending = afr_up_children_count (priv->child_count, local->child_up);
+
+        if (pending == 0) {
+                local->transaction.resume (frame, this);
+                return 0;
+        }
+
+        local->call_count = pending;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (!local->child_up[child])
+                        continue;
+
+                STACK_WIND_COOKIE (frame, afr_flush_wind_cbk,
+                                   (void *) (long) child,
+                                   priv->children[child],
+                                   priv->children[child]->fops->flush,
+                                   local->fd);
+
+                /* leave the loop right after the last expected wind */
+                pending--;
+                if (pending == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Done handler of the flush transaction: unwinds the flush with the
+ * op_ret/op_errno collected in frame->local. Always returns 0.
+ */
+int
+afr_flush_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+
+ return 0;
+}
+
+
+/*
+ * Per-child reply handler for the plain (non-transactional) flush wound
+ * by afr_flush().
+ *
+ * A successful reply sets local->op_ret to 0; op_errno of every reply is
+ * stored, so the last reply wins. The flush is unwound when the last
+ * reply is in. Always returns 0.
+ */
+int
+afr_simple_flush_cbk (call_frame_t *frame, void *cookie,
+                      xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local     = frame->local;
+        int          remaining = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == 0) {
+                        local->op_ret = 0;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+
+        if (remaining == 0) {
+                AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Return 1 when fd_ctx_get() reports success (0) for this translator on
+ * the given fd, 0 otherwise.
+ */
+static int
+__is_fd_ctx_set (xlator_t *this, fd_t *fd)
+{
+        if (fd_ctx_get (fd, this, NULL) == 0)
+                return 1;
+
+        return 0;
+}
+
+
+/*
+ * Flush fd on the children.
+ *
+ * If a context value is set on the fd for this translator, the flush runs
+ * as an AFR_FLUSH_TRANSACTION (afr_flush_wind / afr_flush_done) with
+ * AFR_DATA_PENDING as the changelog key. Otherwise the flush is simply
+ * wound to every child that is up and collected by
+ * afr_simple_flush_cbk().
+ *
+ * Failures before anything is wound are unwound from here with the same
+ * two result values (op_ret, op_errno) as every other flush unwind in
+ * this file. Always returns 0.
+ */
+int
+afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int ret        = -1;
+        int i          = 0;
+        int call_count = 0;
+
+        int op_ret   = -1;
+        int op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        frame->local = local;
+
+        if (__is_fd_ctx_set (this, fd)) {
+                local->op = GF_FOP_FLUSH;
+                local->transaction.fop  = afr_flush_wind;
+                local->transaction.done = afr_flush_done;
+
+                local->fd = fd_ref (fd);
+
+                local->transaction.start = 0;
+                local->transaction.len   = 0;
+
+                local->transaction.pending = AFR_DATA_PENDING;
+
+                afr_transaction (frame, this, AFR_FLUSH_TRANSACTION);
+        } else {
+                /*
+                 * if fd's ctx is not set, then there is no need
+                 * to erase changelog. So just send the flush
+                 */
+
+                call_count = local->call_count;
+
+                for (i = 0; i < priv->child_count; i++) {
+                        if (local->child_up[i]) {
+                                STACK_WIND (frame, afr_simple_flush_cbk,
+                                            priv->children[i],
+                                            priv->children[i]->fops->flush,
+                                            fd);
+
+                                /* leave right after the last expected wind */
+                                if (!--call_count)
+                                        break;
+                        }
+                }
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                /* a flush reply carries only op_ret and op_errno */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ fsync */
+
+/*
+ * Per-child reply handler for afr_fsync().
+ *
+ * A successful reply sets local->op_ret to 0; op_errno of every reply is
+ * stored, so the last reply wins. The fsync is unwound when the last
+ * reply is in. Always returns 0.
+ */
+int
+afr_fsync_cbk (call_frame_t *frame, void *cookie,
+               xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local     = frame->local;
+        int          remaining = -1;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == 0) {
+                        local->op_ret = 0;
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+
+        if (remaining == 0) {
+                AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+        }
+
+        return 0;
+}
+
+
+/*
+ * fsync fop: wind the call to every child flagged in local->child_up.
+ * Replies are handled by afr_fsync_cbk.  If setup fails the frame is
+ * unwound here with op_ret = -1.  Always returns 0.
+ */
+int
+afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
+           int32_t datasync)
+{
+        afr_private_t *priv     = NULL;
+        afr_local_t   *local    = NULL;
+        int32_t        op_ret   = -1;
+        int32_t        op_errno = 0;
+        int32_t        pending  = 0;
+        int            ret      = -1;
+        int            idx      = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        frame->local = local;
+        pending      = local->call_count;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND (frame, afr_fsync_cbk,
+                            priv->children[idx],
+                            priv->children[idx]->fops->fsync,
+                            fd, datasync);
+
+                /* leave the loop right after the last expected wind */
+                if (--pending == 0)
+                        break;
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret != -1)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ fsyncdir */
+
+/*
+ * Reply handler for fsyncdir.
+ *
+ * Under the frame lock, records this child's op_errno and marks the fop
+ * successful if this child succeeded.  When afr_frame_return() returns 0
+ * the frame is unwound with the accumulated op_ret / op_errno.
+ */
+int32_t
+afr_fsyncdir_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+        afr_local_t *local     = frame->local;
+        int          remaining = -1;
+
+        LOCK (&frame->lock);
+        {
+                local->op_errno = op_errno;
+
+                if (op_ret == 0)
+                        local->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+        return 0;
+}
+
+
+/*
+ * fsyncdir fop: wind the call to every child flagged in local->child_up.
+ * Replies are handled by afr_fsyncdir_cbk.  If setup fails the frame is
+ * unwound here with op_ret = -1.  Always returns 0.
+ */
+int32_t
+afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
+              int32_t datasync)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int ret = -1;
+
+        int     i          = 0;
+        int32_t call_count = 0;
+        int32_t op_ret     = -1;
+        int32_t op_errno   = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        call_count = local->call_count;
+        frame->local = local;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (local->child_up[i]) {
+                        /* use the fsyncdir handler, not the fsync one */
+                        STACK_WIND (frame, afr_fsyncdir_cbk,
+                                    priv->children[i],
+                                    priv->children[i]->fops->fsyncdir,
+                                    fd, datasync);
+                        if (!--call_count)
+                                break;
+                }
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ xattrop */
+
+/*
+ * Reply handler for xattrop.
+ *
+ * Under the frame lock, records this child's op_errno and marks the fop
+ * successful if this child succeeded.  When afr_frame_return() returns 0
+ * the frame is unwound with the accumulated result and the @xattr of the
+ * reply being processed.
+ */
+int32_t
+afr_xattrop_cbk (call_frame_t *frame, void *cookie,
+                 xlator_t *this, int32_t op_ret, int32_t op_errno,
+                 dict_t *xattr)
+{
+        afr_local_t *local     = frame->local;
+        int          remaining = -1;
+
+        LOCK (&frame->lock);
+        {
+                local->op_errno = op_errno;
+
+                if (op_ret == 0)
+                        local->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr);
+        return 0;
+}
+
+
+/*
+ * xattrop fop: wind the call to every child flagged in local->child_up.
+ * Replies are handled by afr_xattrop_cbk.  If setup fails the frame is
+ * unwound here with op_ret = -1 and a NULL xattr.  Always returns 0.
+ */
+int32_t
+afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
+             gf_xattrop_flags_t optype, dict_t *xattr)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int ret = -1;
+
+        int     i          = 0;
+        int32_t call_count = 0;
+        int32_t op_ret     = -1;
+        int32_t op_errno   = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        call_count = local->call_count;
+        frame->local = local;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (local->child_up[i]) {
+                        STACK_WIND (frame, afr_xattrop_cbk,
+                                    priv->children[i],
+                                    priv->children[i]->fops->xattrop,
+                                    loc, optype, xattr);
+                        if (!--call_count)
+                                break;
+                }
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                /* the xattrop callback takes a dict_t * after op_errno,
+                   so the failure unwind must supply one as well */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ fxattrop */
+
+/*
+ * Reply handler for fxattrop.
+ *
+ * Under the frame lock, records this child's op_errno and marks the fop
+ * successful if this child succeeded.  When afr_frame_return() returns 0
+ * the frame is unwound with the accumulated result and the @xattr of the
+ * reply being processed.
+ */
+int32_t
+afr_fxattrop_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno,
+                  dict_t *xattr)
+{
+        afr_local_t *local     = frame->local;
+        int          remaining = -1;
+
+        LOCK (&frame->lock);
+        {
+                local->op_errno = op_errno;
+
+                if (op_ret == 0)
+                        local->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr);
+        return 0;
+}
+
+
+/*
+ * fxattrop fop: wind the call to every child flagged in local->child_up.
+ * Replies are handled by afr_fxattrop_cbk.  If setup fails the frame is
+ * unwound here with op_ret = -1 and a NULL xattr.  Always returns 0.
+ */
+int32_t
+afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
+              gf_xattrop_flags_t optype, dict_t *xattr)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int ret = -1;
+
+        int     i          = 0;
+        int32_t call_count = 0;
+        int32_t op_ret     = -1;
+        int32_t op_errno   = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        call_count = local->call_count;
+        frame->local = local;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (local->child_up[i]) {
+                        STACK_WIND (frame, afr_fxattrop_cbk,
+                                    priv->children[i],
+                                    priv->children[i]->fops->fxattrop,
+                                    fd, optype, xattr);
+                        if (!--call_count)
+                                break;
+                }
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                /* the fxattrop callback takes a dict_t * after op_errno,
+                   so the failure unwind must supply one as well */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+        return 0;
+}
+
+/* }}} */
+
+
+/*
+ * Reply handler for inodelk.
+ *
+ * Under the frame lock, records this child's op_errno and marks the fop
+ * successful if this child succeeded.  When afr_frame_return() returns 0
+ * the frame is unwound with the accumulated op_ret / op_errno.
+ */
+int32_t
+afr_inodelk_cbk (call_frame_t *frame, void *cookie,
+                 xlator_t *this, int32_t op_ret, int32_t op_errno)
+
+{
+        afr_local_t *local     = frame->local;
+        int          remaining = -1;
+
+        LOCK (&frame->lock);
+        {
+                local->op_errno = op_errno;
+
+                if (op_ret == 0)
+                        local->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+        return 0;
+}
+
+
+/*
+ * inodelk fop: wind the call to every child flagged in local->child_up.
+ * Replies are handled by afr_inodelk_cbk.  If setup fails the frame is
+ * unwound here with op_ret = -1.  Always returns 0.
+ */
+int32_t
+afr_inodelk (call_frame_t *frame, xlator_t *this, loc_t *loc,
+             int32_t cmd, struct flock *flock)
+{
+        afr_private_t *priv     = NULL;
+        afr_local_t   *local    = NULL;
+        int32_t        op_ret   = -1;
+        int32_t        op_errno = 0;
+        int32_t        pending  = 0;
+        int            ret      = -1;
+        int            idx      = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        frame->local = local;
+        pending      = local->call_count;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND (frame, afr_inodelk_cbk,
+                            priv->children[idx],
+                            priv->children[idx]->fops->inodelk,
+                            loc, cmd, flock);
+
+                /* leave the loop right after the last expected wind */
+                if (--pending == 0)
+                        break;
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret != -1)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+
+/*
+ * Reply handler for finodelk.
+ *
+ * Under the frame lock, records this child's op_errno and marks the fop
+ * successful if this child succeeded.  When afr_frame_return() returns 0
+ * the frame is unwound with the accumulated op_ret / op_errno.
+ */
+int32_t
+afr_finodelk_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno)
+
+{
+        afr_local_t *local     = frame->local;
+        int          remaining = -1;
+
+        LOCK (&frame->lock);
+        {
+                local->op_errno = op_errno;
+
+                if (op_ret == 0)
+                        local->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+        return 0;
+}
+
+
+/*
+ * finodelk fop: wind the call to every child flagged in local->child_up.
+ * Replies are handled by afr_finodelk_cbk.  If setup fails the frame is
+ * unwound here with op_ret = -1.  Always returns 0.
+ */
+int32_t
+afr_finodelk (call_frame_t *frame, xlator_t *this, fd_t *fd,
+              int32_t cmd, struct flock *flock)
+{
+        afr_private_t *priv     = NULL;
+        afr_local_t   *local    = NULL;
+        int32_t        op_ret   = -1;
+        int32_t        op_errno = 0;
+        int32_t        pending  = 0;
+        int            ret      = -1;
+        int            idx      = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        frame->local = local;
+        pending      = local->call_count;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND (frame, afr_finodelk_cbk,
+                            priv->children[idx],
+                            priv->children[idx]->fops->finodelk,
+                            fd, cmd, flock);
+
+                /* leave the loop right after the last expected wind */
+                if (--pending == 0)
+                        break;
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret != -1)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+
+/*
+ * Reply handler for entrylk.
+ *
+ * Under the frame lock, records this child's op_errno and marks the fop
+ * successful if this child succeeded.  When afr_frame_return() returns 0
+ * the frame is unwound with the accumulated op_ret / op_errno.
+ */
+int32_t
+afr_entrylk_cbk (call_frame_t *frame, void *cookie,
+                 xlator_t *this, int32_t op_ret, int32_t op_errno)
+
+{
+        afr_local_t *local     = frame->local;
+        int          remaining = -1;
+
+        LOCK (&frame->lock);
+        {
+                local->op_errno = op_errno;
+
+                if (op_ret == 0)
+                        local->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+        return 0;
+}
+
+
+/*
+ * entrylk fop: wind the call to every child flagged in local->child_up.
+ * Replies are handled by afr_entrylk_cbk.  If setup fails the frame is
+ * unwound here with op_ret = -1.  Always returns 0.
+ */
+int32_t
+afr_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc,
+             const char *basename, entrylk_cmd cmd, entrylk_type type)
+{
+        afr_private_t *priv     = NULL;
+        afr_local_t   *local    = NULL;
+        int32_t        op_ret   = -1;
+        int32_t        op_errno = 0;
+        int32_t        pending  = 0;
+        int            ret      = -1;
+        int            idx      = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        frame->local = local;
+        pending      = local->call_count;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND (frame, afr_entrylk_cbk,
+                            priv->children[idx],
+                            priv->children[idx]->fops->entrylk,
+                            loc, basename, cmd, type);
+
+                /* leave the loop right after the last expected wind */
+                if (--pending == 0)
+                        break;
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret != -1)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+
+
+/*
+ * Reply handler for fentrylk.
+ *
+ * Under the frame lock, records this child's op_errno and marks the fop
+ * successful if this child succeeded.  When afr_frame_return() returns 0
+ * the frame is unwound with the accumulated op_ret / op_errno.
+ */
+int32_t
+afr_fentrylk_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno)
+
+{
+        afr_local_t *local     = frame->local;
+        int          remaining = -1;
+
+        LOCK (&frame->lock);
+        {
+                local->op_errno = op_errno;
+
+                if (op_ret == 0)
+                        local->op_ret = 0;
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+        return 0;
+}
+
+
+/*
+ * fentrylk fop: wind the call to every child flagged in local->child_up.
+ * Replies are handled by afr_fentrylk_cbk.  If setup fails the frame is
+ * unwound here with op_ret = -1.  Always returns 0.
+ */
+int32_t
+afr_fentrylk (call_frame_t *frame, xlator_t *this, fd_t *fd,
+              const char *basename, entrylk_cmd cmd, entrylk_type type)
+{
+        afr_private_t *priv     = NULL;
+        afr_local_t   *local    = NULL;
+        int32_t        op_ret   = -1;
+        int32_t        op_errno = 0;
+        int32_t        pending  = 0;
+        int            ret      = -1;
+        int            idx      = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        frame->local = local;
+        pending      = local->call_count;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND (frame, afr_fentrylk_cbk,
+                            priv->children[idx],
+                            priv->children[idx]->fops->fentrylk,
+                            fd, basename, cmd, type);
+
+                /* leave the loop right after the last expected wind */
+                if (--pending == 0)
+                        break;
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret != -1)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+
+/*
+ * Reply handler for checksum.
+ *
+ * The first successful reply (local->op_ret still non-zero) has both of
+ * its checksum buffers copied, ZR_FILENAME_MAX bytes each, into
+ * local->cont.checksum, and only then is local->op_ret set to 0.  Later
+ * successful replies are not copied.  The op_errno of the reply being
+ * processed is always recorded, or ENOMEM if the copies could not be
+ * allocated.  When afr_frame_return() returns 0 the frame is unwound
+ * with the saved result and buffers.
+ */
+int32_t
+afr_checksum_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno,
+                  uint8_t *file_checksum, uint8_t *dir_checksum)
+
+{
+        afr_local_t *local = NULL;
+
+        int call_count = -1;
+
+        local = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                if (op_ret == 0 && (local->op_ret != 0)) {
+                        local->cont.checksum.file_checksum =
+                                MALLOC (ZR_FILENAME_MAX);
+                        local->cont.checksum.dir_checksum =
+                                MALLOC (ZR_FILENAME_MAX);
+
+                        if (local->cont.checksum.file_checksum
+                            && local->cont.checksum.dir_checksum) {
+                                memcpy (local->cont.checksum.file_checksum,
+                                        file_checksum, ZR_FILENAME_MAX);
+                                memcpy (local->cont.checksum.dir_checksum,
+                                        dir_checksum, ZR_FILENAME_MAX);
+
+                                local->op_ret = 0;
+                        } else {
+                                /* never memcpy into a failed allocation and
+                                   never report success without both copies.
+                                   NOTE(review): a buffer that did get
+                                   allocated stays in local; confirm the
+                                   local cleanup path releases it. */
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "out of memory :(");
+                                op_errno = ENOMEM;
+                        }
+                }
+
+                local->op_errno = op_errno;
+        }
+        UNLOCK (&frame->lock);
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0)
+                AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                                  local->cont.checksum.file_checksum,
+                                  local->cont.checksum.dir_checksum);
+
+        return 0;
+}
+
+
+/*
+ * checksum fop: wind the call to every child flagged in local->child_up.
+ * Replies are handled by afr_checksum_cbk.  If setup fails the frame is
+ * unwound here with op_ret = -1 and NULL checksum buffers.  Always
+ * returns 0.
+ */
+int32_t
+afr_checksum (call_frame_t *frame, xlator_t *this, loc_t *loc,
+              int32_t flag)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int ret = -1;
+
+        int     i          = 0;
+        int32_t call_count = 0;
+        int32_t op_ret     = -1;
+        int32_t op_errno   = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        call_count = local->call_count;
+        frame->local = local;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (local->child_up[i]) {
+                        STACK_WIND (frame, afr_checksum_cbk,
+                                    priv->children[i],
+                                    priv->children[i]->fops->checksum,
+                                    loc, flag);
+
+                        if (!--call_count)
+                                break;
+                }
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                /* the checksum callback takes two buffer pointers after
+                   op_errno, so the failure unwind must supply them too */
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL);
+        }
+        return 0;
+}
+
+
+/*
+ * Reply handler for statfs.
+ *
+ * A failed reply (op_ret == -1) only records its op_errno.  A successful
+ * reply sets local->op_ret and is kept in local->cont.statfs.buf if it is
+ * the first one stored, or if its f_bavail is smaller than the stored
+ * one.  When afr_frame_return() returns 0 the frame is unwound with the
+ * stored buffer.
+ */
+int32_t
+afr_statfs_cbk (call_frame_t *frame, void *cookie,
+                xlator_t *this, int32_t op_ret, int32_t op_errno,
+                struct statvfs *statvfs)
+{
+        afr_local_t *local     = NULL;
+        int          remaining = 0;
+
+        LOCK (&frame->lock);
+        {
+                local = frame->local;
+
+                if (op_ret == -1) {
+                        local->op_errno = op_errno;
+                } else if (op_ret == 0) {
+                        local->op_ret = op_ret;
+
+                        if (!local->cont.statfs.buf_set) {
+                                local->cont.statfs.buf     = *statvfs;
+                                local->cont.statfs.buf_set = 1;
+                        } else if (statvfs->f_bavail
+                                   < local->cont.statfs.buf.f_bavail) {
+                                local->cont.statfs.buf = *statvfs;
+                        }
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                          &local->cont.statfs.buf);
+        return 0;
+}
+
+
+/*
+ * statfs fop: wind the call to every child flagged in local->child_up.
+ * Replies are handled by afr_statfs_cbk.  If setup fails the frame is
+ * unwound here with op_ret = -1 and a NULL buffer.  Always returns 0.
+ */
+int32_t
+afr_statfs (call_frame_t *frame, xlator_t *this,
+            loc_t *loc)
+{
+        afr_private_t *priv        = NULL;
+        afr_local_t   *local       = NULL;
+        int32_t        op_ret      = -1;
+        int32_t        op_errno    = 0;
+        int            child_count = 0;
+        int            pending     = 0;
+        int            ret         = -1;
+        int            idx         = 0;
+
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        VALIDATE_OR_GOTO (loc, out);
+
+        priv        = this->private;
+        child_count = priv->child_count;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        pending      = local->call_count;
+        frame->local = local;
+
+        for (idx = 0; idx < child_count; idx++) {
+                if (!local->child_up[idx])
+                        continue;
+
+                STACK_WIND (frame, afr_statfs_cbk,
+                            priv->children[idx],
+                            priv->children[idx]->fops->statfs,
+                            loc);
+
+                /* leave the loop right after the last expected wind */
+                if (--pending == 0)
+                        break;
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret != -1)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        return 0;
+}
+
+
+/*
+ * Reply handler for the unlock requests sent by afr_lk_unlock.
+ *
+ * The outcome of the individual unlock is not inspected.  When
+ * afr_frame_return() returns 0 the frame is unwound with the op_ret /
+ * op_errno already stored in local and the @lock of this reply.
+ */
+int32_t
+afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct flock *lock)
+{
+        afr_local_t *local     = frame->local;
+        int          remaining = -1;
+
+        remaining = afr_frame_return (frame);
+        if (remaining != 0)
+                return 0;
+
+        AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                          lock);
+        return 0;
+}
+
+
+/*
+ * Send an F_UNLCK (via F_SETLK) to every child recorded in
+ * local->cont.lk.locked_nodes.
+ *
+ * If no child is recorded as locked, the frame is unwound right away
+ * with the op_ret / op_errno stored in local.  Otherwise
+ * afr_lk_unlock_cbk handles the replies.  Always returns 0.
+ */
+int32_t
+afr_lk_unlock (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+        int            pending = 0;
+        int            idx     = 0;
+
+        pending = afr_locked_nodes_count (local->cont.lk.locked_nodes,
+                                          priv->child_count);
+        if (pending == 0) {
+                AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+                                  &local->cont.lk.flock);
+                return 0;
+        }
+
+        local->call_count           = pending;
+        local->cont.lk.flock.l_type = F_UNLCK;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->cont.lk.locked_nodes[idx])
+                        continue;
+
+                STACK_WIND (frame, afr_lk_unlock_cbk,
+                            priv->children[idx],
+                            priv->children[idx]->fops->lk,
+                            local->fd, F_SETLK,
+                            &local->cont.lk.flock);
+
+                /* leave the loop right after the last expected wind */
+                if (--pending == 0)
+                        break;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Reply handler for lk.  The lock request is sent to one child at a
+ * time, in index order; @cookie carries the index of the child that
+ * just replied.
+ *
+ * - A failure that child_went_down() does not classify as "child down"
+ *   aborts the chain: the error is stored and afr_lk_unlock() is called.
+ * - A success records the returned lock and marks the child in
+ *   local->cont.lk.locked_nodes.
+ * - Then the request is wound to the next child, or, when there is no
+ *   next child, the frame is unwound.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct flock *lock)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ int call_count = -1;
+ int child_index = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ /* NOTE(review): decremented without taking frame->lock, and the
+    result is not read again in this function */
+ call_count = --local->call_count;
+
+ if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+
+ afr_lk_unlock (frame, this);
+ return 0;
+ }
+
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->op_errno = 0;
+ local->cont.lk.flock = *lock;
+ local->cont.lk.locked_nodes[child_index] = 1;
+ }
+
+ /* move on to the next child in the array */
+ child_index++;
+
+ if (child_index < priv->child_count) {
+ STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->lk,
+ local->fd, local->cont.lk.cmd,
+ &local->cont.lk.flock);
+ } else if (local->op_ret == -1) {
+ /* all nodes have gone down */
+ /* NOTE(review): relies on local->op_ret starting out as -1;
+    presumably set by AFR_LOCAL_INIT -- confirm */
+
+ AFR_STACK_UNWIND (frame, -1, ENOTCONN, &local->cont.lk.flock);
+ } else {
+ /* locking has succeeded on all nodes that are up */
+
+ AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ &local->cont.lk.flock);
+ }
+
+ return 0;
+}
+
+
+/*
+ * lk fop.
+ *
+ * The lock request is first wound to child 0 only, with the child index
+ * as cookie; afr_lk_cbk then carries it on to the remaining children one
+ * at a time.  The fd, command and flock are saved in local for that
+ * purpose, together with a zeroed per-child locked_nodes array.
+ *
+ * If setup fails the frame is unwound here with op_ret = -1 and a NULL
+ * flock.  Always returns 0.
+ */
+int
+afr_lk (call_frame_t *frame, xlator_t *this,
+        fd_t *fd, int32_t cmd,
+        struct flock *flock)
+{
+        afr_private_t *priv  = NULL;
+        afr_local_t   *local = NULL;
+
+        int ret = -1;
+        int i   = 0;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        /* check the initialisation result, as the other fops in this
+           file do, instead of carrying on with a half-set-up local */
+        ret = AFR_LOCAL_INIT (local, priv);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+
+        frame->local = local;
+
+        local->cont.lk.locked_nodes = CALLOC (priv->child_count,
+                                              sizeof (*local->cont.lk.locked_nodes));
+
+        if (!local->cont.lk.locked_nodes) {
+                gf_log (this->name, GF_LOG_ERROR, "out of memory :(");
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        local->fd            = fd_ref (fd);
+        local->cont.lk.cmd   = cmd;
+        local->cont.lk.flock = *flock;
+
+        /* i is still 0 here: start the chain at the first child */
+        STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0,
+                           priv->children[i],
+                           priv->children[i]->fops->lk,
+                           fd, cmd, flock);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+        return 0;
+}
+
+
+/**
+ * find_child_index - locate a subvolume in AFR's array of children
+ * @this: AFR
+ * @child: child to look for
+ *
+ * Returns the position of @child in priv->children.  If @child is not
+ * in the array, the value returned equals priv->child_count.
+ */
+
+static int
+find_child_index (xlator_t *this, xlator_t *child)
+{
+        afr_private_t *priv = this->private;
+        int            idx  = 0;
+
+        while (idx < priv->child_count && priv->children[idx] != child)
+                idx++;
+
+        return idx;
+}
+
+
+/*
+ * Event handler for this translator.
+ *
+ * GF_EVENT_CHILD_UP / GF_EVENT_CHILD_DOWN update the per-child flag in
+ * priv->child_up for the child passed in @data.  The event is passed on
+ * through default_notify() only when it changes the overall picture:
+ * CHILD_UP when exactly one child is now up, CHILD_DOWN when none is.
+ * Every other event goes straight to default_notify().
+ *
+ * An up/down event whose @data is not one of this translator's children
+ * is logged and dropped.  Returns 0 in all cases.
+ */
+int32_t
+notify (xlator_t *this, int32_t event,
+        void *data, ...)
+{
+        afr_private_t *priv     = NULL;
+        unsigned char *child_up = NULL;
+
+        int i           = -1;
+        int up_children = 0;
+
+        priv = this->private;
+
+        if (!priv)
+                return 0;
+
+        child_up = priv->child_up;
+
+        switch (event) {
+        case GF_EVENT_CHILD_UP:
+                i = find_child_index (this, data);
+
+                /* find_child_index() returns child_count on a miss;
+                   indexing child_up with that would write out of bounds */
+                if (i < 0 || (unsigned int) i >= priv->child_count) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "CHILD_UP received for an unknown child");
+                        break;
+                }
+
+                child_up[i] = 1;
+
+                /*
+                  if all the children were down, and one child came up,
+                  send notify to parent
+                */
+
+                for (i = 0; i < priv->child_count; i++)
+                        if (child_up[i])
+                                up_children++;
+
+                if (up_children == 1)
+                        default_notify (this, event, data);
+
+                break;
+
+        case GF_EVENT_CHILD_DOWN:
+                i = find_child_index (this, data);
+
+                /* same out-of-bounds guard as for CHILD_UP */
+                if (i < 0 || (unsigned int) i >= priv->child_count) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "CHILD_DOWN received for an unknown child");
+                        break;
+                }
+
+                child_up[i] = 0;
+
+                /*
+                  if all children are down, and this was the last to go down,
+                  send notify to parent
+                */
+
+                for (i = 0; i < priv->child_count; i++)
+                        if (child_up[i])
+                                up_children++;
+
+                if (up_children == 0)
+                        default_notify (this, event, data);
+
+                break;
+
+        default:
+                default_notify (this, event, data);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Warning logged by init() for the subvolume named in the
+ * 'favorite-child' option.  This is a format string with three %s
+ * conversions; init() passes the subvolume name for each of them.
+ */
+static const char *favorite_child_warning_str = "You have specified subvolume '%s' "
+ "as the 'favorite child'. This means that if a discrepancy in the content "
+ "or attributes (ownership, permission, etc.) of a file is detected among "
+ "the subvolumes, the file on '%s' will be considered the definitive "
+ "version and its contents will OVERWRITE the contents of the file on other "
+ "subvolumes. All versions of the file except that on '%s' "
+ "WILL BE LOST.";
+
+/*
+ * Warning logged by init() when 'data-lock-server-count' is set to 0.
+ * It contains no conversion specifiers and is passed to gf_log() as the
+ * format string itself.
+ */
+static const char *no_lock_servers_warning_str = "You have set lock-server-count = 0. "
+ "This means correctness is NO LONGER GUARANTEED in all cases. If two or more "
+ "applications write to the same region of a file, there is a possibility that "
+ "its copies will be INCONSISTENT. Set it to a value greater than 0 unless you "
+ "are ABSOLUTELY SURE of what you are doing and WILL NOT HOLD GlusterFS "
+ "RESPONSIBLE for inconsistent data. If you are in doubt, set it to a value "
+ "greater than 0.";
+
+/*
+ * Translator initialisation.
+ *
+ * Allocates the private structure, fills it from this->options
+ * (self-heal switches, change-log switches, lock server counts,
+ * read-subvolume, favorite-child), counts the children and builds the
+ * priv->children / priv->child_up arrays.
+ *
+ * Returns 0 on success and -1 on failure.
+ *
+ * NOTE(review): on the allocation-failure paths nothing that was
+ * allocated earlier in this function is released before returning.
+ */
+int32_t
+init (xlator_t *this)
+{
+ afr_private_t * priv = NULL;
+ int child_count = 0;
+ xlator_list_t * trav = NULL;
+ int i = 0;
+ int ret = -1;
+ /* NOTE(review): op_errno is only assigned, never read, in the code
+    visible here; possibly referenced by ALLOC_OR_GOTO -- confirm
+    before removing */
+ int op_errno = 0;
+
+ char * read_subvol = NULL;
+ char * fav_child = NULL;
+ char * self_heal = NULL;
+ char * change_log = NULL;
+
+ int32_t lock_server_count = 1;
+
+ int fav_ret = -1;
+ int read_ret = -1;
+ int dict_ret = -1;
+
+ /* NOTE(review): the message asks for more than one child, but this
+    test only rejects an empty child list */
+ if (!this->children) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "AFR needs more than one child defined");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile ");
+ }
+
+ ALLOC_OR_GOTO (this->private, afr_private_t, out);
+
+ priv = this->private;
+
+ /* the names are only looked up here; they are matched against the
+    children further down */
+ read_ret = dict_get_str (this->options, "read-subvolume", &read_subvol);
+ priv->read_child = -1;
+
+ fav_ret = dict_get_str (this->options, "favorite-child", &fav_child);
+ priv->favorite_child = -1;
+
+ /* Default values */
+
+ priv->data_self_heal = 1;
+ priv->metadata_self_heal = 1;
+ priv->entry_self_heal = 1;
+
+ /* each boolean option below follows the same pattern: if the key is
+    present, parse it, and restore the default if parsing fails */
+ dict_ret = dict_get_str (this->options, "data-self-heal", &self_heal);
+ if (dict_ret == 0) {
+ ret = gf_string2boolean (self_heal, &priv->data_self_heal);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "invalid 'option data-self-heal %s' "
+ "defaulting to data-self-heal as 'on'",
+ self_heal);
+ priv->data_self_heal = 1;
+ }
+ }
+
+ dict_ret = dict_get_str (this->options, "metadata-self-heal",
+ &self_heal);
+ if (dict_ret == 0) {
+ ret = gf_string2boolean (self_heal, &priv->metadata_self_heal);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "invalid 'option metadata-self-heal %s' "
+ "defaulting to metadata-self-heal as 'on'",
+ self_heal);
+ priv->metadata_self_heal = 1;
+ }
+ }
+
+ dict_ret = dict_get_str (this->options, "entry-self-heal", &self_heal);
+ if (dict_ret == 0) {
+ ret = gf_string2boolean (self_heal, &priv->entry_self_heal);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "invalid 'option entry-self-heal %s' "
+ "defaulting to entry-self-heal as 'on'",
+ self_heal);
+ priv->entry_self_heal = 1;
+ }
+ }
+
+ /* Change log options */
+
+ priv->data_change_log = 1;
+ priv->metadata_change_log = 0;
+ priv->entry_change_log = 1;
+
+ dict_ret = dict_get_str (this->options, "data-change-log",
+ &change_log);
+ if (dict_ret == 0) {
+ ret = gf_string2boolean (change_log, &priv->data_change_log);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "invalid 'option data-change-log %s'. "
+ "defaulting to data-change-log as 'on'",
+ change_log);
+ priv->data_change_log = 1;
+ }
+ }
+
+ dict_ret = dict_get_str (this->options, "metadata-change-log",
+ &change_log);
+ if (dict_ret == 0) {
+ ret = gf_string2boolean (change_log,
+ &priv->metadata_change_log);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "invalid 'option metadata-change-log %s'. "
+ "defaulting to metadata-change-log as 'off'",
+ change_log);
+ priv->metadata_change_log = 0;
+ }
+ }
+
+ dict_ret = dict_get_str (this->options, "entry-change-log",
+ &change_log);
+ if (dict_ret == 0) {
+ ret = gf_string2boolean (change_log, &priv->entry_change_log);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "invalid 'option entry-change-log %s'. "
+ "defaulting to entry-change-log as 'on'",
+ change_log);
+ priv->entry_change_log = 1;
+ }
+ }
+
+ /* Locking options */
+
+ priv->data_lock_server_count = 1;
+ priv->metadata_lock_server_count = 0;
+ priv->entry_lock_server_count = 1;
+
+ dict_ret = dict_get_int32 (this->options, "data-lock-server-count",
+ &lock_server_count);
+ if (dict_ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setting data lock server count to %d",
+ lock_server_count);
+
+ /* only the data lock count triggers the zero-servers warning */
+ if (lock_server_count == 0)
+ gf_log (this->name, GF_LOG_WARNING,
+ no_lock_servers_warning_str);
+
+ priv->data_lock_server_count = lock_server_count;
+ }
+
+
+ dict_ret = dict_get_int32 (this->options,
+ "metadata-lock-server-count",
+ &lock_server_count);
+ if (dict_ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setting metadata lock server count to %d",
+ lock_server_count);
+ priv->metadata_lock_server_count = lock_server_count;
+ }
+
+
+ dict_ret = dict_get_int32 (this->options, "entry-lock-server-count",
+ &lock_server_count);
+ if (dict_ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setting entry lock server count to %d",
+ lock_server_count);
+
+ priv->entry_lock_server_count = lock_server_count;
+ }
+
+
+ /* first pass over the children: count them and resolve the
+    read-subvolume / favorite-child names to indices */
+ trav = this->children;
+ while (trav) {
+ if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "subvolume '%s' specified as read child",
+ trav->xlator->name);
+
+ priv->read_child = child_count;
+ }
+
+ if (fav_ret == 0 && !strcmp (fav_child, trav->xlator->name)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ favorite_child_warning_str, trav->xlator->name,
+ trav->xlator->name, trav->xlator->name);
+ priv->favorite_child = child_count;
+ }
+
+ child_count++;
+ trav = trav->next;
+ }
+
+ /* XXX: return inode numbers from 1st subvolume till
+ afr supports read-subvolume based on inode's ctx
+ (and not itransform) for this reason afr_deitransform()
+ returns 0 always
+ */
+ /* this overrides whatever the read-subvolume option selected above */
+ priv->read_child = 0;
+
+ priv->wait_count = 1;
+
+ priv->child_count = child_count;
+ LOCK_INIT (&priv->lock);
+
+ priv->child_up = CALLOC (sizeof (unsigned char), child_count);
+ if (!priv->child_up) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ priv->children = CALLOC (sizeof (xlator_t *), child_count);
+ if (!priv->children) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ /* second pass: store the child translators in index order */
+ trav = this->children;
+ i = 0;
+ while (i < child_count) {
+ priv->children[i] = trav->xlator;
+
+ trav = trav->next;
+ i++;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+
+/*
+ * Translator teardown hook.  Performs no work and reports success.
+ */
+int
+fini (xlator_t *this)
+{
+        int ret = 0;
+
+        return ret;
+}
+
+
+/*
+ * File-operation table for this translator: maps each fop to its
+ * afr_* entry point.  The inline comments group the entries by the kind
+ * of operation they perform.
+ */
+struct xlator_fops fops = {
+ .lookup = afr_lookup,
+ .open = afr_open,
+ .lk = afr_lk,
+ .flush = afr_flush,
+ .statfs = afr_statfs,
+ .fsync = afr_fsync,
+ .fsyncdir = afr_fsyncdir,
+ .xattrop = afr_xattrop,
+ .fxattrop = afr_fxattrop,
+ .inodelk = afr_inodelk,
+ .finodelk = afr_finodelk,
+ .entrylk = afr_entrylk,
+ .fentrylk = afr_fentrylk,
+ .checksum = afr_checksum,
+
+ /* inode read */
+ .access = afr_access,
+ .stat = afr_stat,
+ .fstat = afr_fstat,
+ .readlink = afr_readlink,
+ .getxattr = afr_getxattr,
+ .readv = afr_readv,
+
+ /* inode write */
+ .chmod = afr_chmod,
+ .chown = afr_chown,
+ .fchmod = afr_fchmod,
+ .fchown = afr_fchown,
+ .writev = afr_writev,
+ .truncate = afr_truncate,
+ .ftruncate = afr_ftruncate,
+ .utimens = afr_utimens,
+ .setxattr = afr_setxattr,
+ .removexattr = afr_removexattr,
+
+ /* dir read */
+ .opendir = afr_opendir,
+ .readdir = afr_readdir,
+ .getdents = afr_getdents,
+
+ /* dir write */
+ .create = afr_create,
+ .mknod = afr_mknod,
+ .mkdir = afr_mkdir,
+ .unlink = afr_unlink,
+ .rmdir = afr_rmdir,
+ .link = afr_link,
+ .symlink = afr_symlink,
+ .rename = afr_rename,
+ .setdents = afr_setdents,
+};
+
+
+/* Management-operation table; this translator sets no entries in it. */
+struct xlator_mops mops = {
+};
+
+
+/* Callback table; this translator sets no entries in it. */
+struct xlator_cbks cbks = {
+};
+
+/*
+ * Option descriptors for this translator: one entry per key that init()
+ * reads from this->options, giving the key's type and, for the lock
+ * server counts, a minimum of 0.  The array ends with a NULL-key entry.
+ */
+struct volume_options options[] = {
+ { .key = {"read-subvolume" },
+ .type = GF_OPTION_TYPE_XLATOR
+ },
+ { .key = {"favorite-child"},
+ .type = GF_OPTION_TYPE_XLATOR
+ },
+ { .key = {"data-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"metadata-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"entry-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"data-change-log"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"metadata-change-log"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"entry-change-log"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"data-lock-server-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0
+ },
+ { .key = {"metadata-lock-server-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0
+ },
+ { .key = {"entry-lock-server-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0
+ },
+ { .key = {NULL} },
+};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
new file mode 100644
index 000000000..4cf6cdf9d
--- /dev/null
+++ b/xlators/cluster/afr/src/afr.h
@@ -0,0 +1,523 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+#ifndef __AFR_H__
+#define __AFR_H__
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "scheduler.h"
+#include "call-stub.h"
+#include "compat-errno.h"
+
+
+typedef struct _afr_private { /* translator-wide AFR state: child list, up/down map and tunables */
+ gf_lock_t lock; /* to guard access to child_count, etc */
+ unsigned int child_count; /* total number of children */
+
+ xlator_t **children; /* presumably child_count entries — TODO confirm */
+
+ unsigned char *child_up; /* one entry per child; AFR_LOCAL_INIT copies child_count entries of it into each local */
+
+ gf_boolean_t data_self_heal; /* on/off */
+ gf_boolean_t metadata_self_heal; /* on/off */
+ gf_boolean_t entry_self_heal; /* on/off */
+
+
+ gf_boolean_t data_change_log; /* on/off */
+ gf_boolean_t metadata_change_log; /* on/off */
+ gf_boolean_t entry_change_log; /* on/off */
+
+ unsigned int read_child; /* read-subvolume */
+ unsigned int favorite_child; /* subvolume to be preferred in resolving
+ split-brain cases */
+
+ unsigned int data_lock_server_count; /* presumably set from the "*-lock-server-count" options — TODO confirm */
+ unsigned int metadata_lock_server_count;
+ unsigned int entry_lock_server_count;
+
+ unsigned int wait_count; /* # of servers to wait for success */
+} afr_private_t;
+
+typedef struct { /* self-heal bookkeeping; embedded in afr_local_t as the self_heal member */
+ /* array of stat's, one for each child */
+ struct stat *buf;
+
+ /* array of xattr's, one for each child */
+ dict_t **xattr;
+
+ /* array of errno's, one for each child */
+ int *child_errno;
+
+ int32_t **pending_matrix; /* NOTE(review): dimensions not visible here; presumably child x child — confirm */
+ int32_t **delta_matrix; /* NOTE(review): same shape question as pending_matrix — confirm */
+
+ int *sources; /* presumably one flag per child — TODO confirm */
+ int source;
+ int active_source;
+ int active_sinks;
+ int *success; /* presumably one flag per child — TODO confirm */
+
+ fd_t *healing_fd;
+ int op_failed;
+
+ int file_has_holes;
+ blksize_t block_size;
+ off_t file_size;
+ off_t offset;
+
+ loc_t parent_loc;
+ int (*completion_cbk) (call_frame_t *frame, xlator_t *this); /* callback taking (frame, this); when it fires is decided by the self-heal code, not visible here */
+ call_frame_t *sh_frame;
+} afr_self_heal_t;
+
+
+typedef enum { /* kind of transaction; stored in afr_local_t.transaction.type */
+ AFR_DATA_TRANSACTION, /* truncate, write, ... */
+ AFR_METADATA_TRANSACTION, /* chmod, chown, ... */
+ AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */
+ AFR_ENTRY_RENAME_TRANSACTION, /* rename */
+ AFR_FLUSH_TRANSACTION, /* flush */
+} afr_transaction_type;
+
+typedef struct _afr_local { /* per-call state kept in frame->local; AFR_STACK_UNWIND / AFR_STACK_DESTROY detach, clean up and free it */
+ unsigned int call_count; /* AFR_LOCAL_INIT sets this to the number of children that are up */
+ unsigned int success_count;
+ unsigned int enoent_count;
+
+ unsigned int need_metadata_self_heal;
+ unsigned int need_entry_self_heal;
+ unsigned int need_data_self_heal;
+ unsigned int govinda_gOvinda; /* NOTE(review): purpose is not evident from this header — confirm against its users */
+
+ unsigned int reval_child_index;
+ int32_t op_ret; /* AFR_LOCAL_INIT starts this at -1 */
+ int32_t op_errno; /* AFR_LOCAL_INIT starts this at EUCLEAN */
+
+ int32_t *pending_array; /* priv->child_count entries, allocated by afr_transaction_local_init */
+
+ loc_t loc;
+ loc_t newloc;
+
+ fd_t *fd;
+
+ glusterfs_fop_t fop;
+
+ unsigned char *child_up; /* private copy of priv->child_up, made by AFR_LOCAL_INIT */
+ int child_count;
+
+ int32_t *child_errno; /* priv->child_count entries, allocated by afr_transaction_local_init */
+
+ dict_t *xattr_req;
+ int open_fd_count;
+ /*
+ This struct contains the arguments for the "continuation"
+ (scheme-like) of fops
+ */
+
+ int op;
+ struct {
+ struct {
+ unsigned char buf_set;
+ struct statvfs buf;
+ } statfs;
+
+ struct {
+ inode_t *inode;
+ struct stat buf;
+ dict_t *xattr;
+ } lookup;
+
+ struct {
+ int32_t flags;
+ } open;
+
+ struct {
+ int32_t cmd;
+ struct flock flock;
+ unsigned char *locked_nodes;
+ } lk;
+
+ struct {
+ uint8_t *file_checksum;
+ uint8_t *dir_checksum;
+ } checksum;
+
+ /* inode read */
+
+ struct {
+ int32_t mask;
+ int last_tried; /* index of the child we tried previously */
+ } access;
+
+ struct {
+ int last_tried;
+ ino_t ino;
+ } stat;
+
+ struct {
+ int last_tried;
+ ino_t ino;
+ } fstat;
+
+ struct {
+ size_t size;
+ int last_tried;
+ } readlink;
+
+ struct {
+ const char *name;
+ int last_tried;
+ } getxattr;
+
+ struct {
+ size_t size;
+ off_t offset;
+ int last_tried;
+ } readv;
+
+ /* dir read */
+
+ struct {
+ int success_count;
+ int32_t op_ret;
+ int32_t op_errno;
+ } opendir;
+
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+ size_t size;
+ off_t offset;
+
+ int last_tried;
+ } readdir;
+
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+
+ size_t size;
+ off_t offset;
+ int32_t flag;
+
+ int last_tried;
+ } getdents;
+
+ /* inode write */
+
+ struct {
+ ino_t ino;
+ mode_t mode;
+ struct stat buf;
+ } chmod;
+
+ struct {
+ ino_t ino;
+ mode_t mode;
+ struct stat buf;
+ } fchmod;
+
+ struct {
+ ino_t ino;
+ uid_t uid;
+ gid_t gid;
+ struct stat buf;
+ } chown;
+
+ struct {
+ ino_t ino;
+ uid_t uid;
+ gid_t gid;
+ struct stat buf;
+ } fchown;
+
+ struct {
+ ino_t ino;
+ struct stat buf;
+
+ int32_t op_ret;
+
+ struct iovec *vector;
+ dict_t *refs;
+ int32_t count;
+ off_t offset;
+ } writev;
+
+ struct {
+ ino_t ino;
+ off_t offset;
+ struct stat buf;
+ } truncate;
+
+ struct {
+ ino_t ino;
+ off_t offset;
+ struct stat buf;
+ } ftruncate;
+
+ struct {
+ ino_t ino;
+ struct timespec tv[2];
+ struct stat buf;
+ } utimens;
+
+ struct {
+ dict_t *dict;
+ int32_t flags;
+ } setxattr;
+
+ struct {
+ const char *name;
+ } removexattr;
+
+ /* dir write */
+
+ struct {
+ ino_t ino;
+ fd_t *fd;
+ int32_t flags;
+ mode_t mode;
+ inode_t *inode;
+ struct stat buf;
+ } create;
+
+ struct {
+ ino_t ino;
+ dev_t dev;
+ mode_t mode;
+ inode_t *inode;
+ struct stat buf;
+ } mknod;
+
+ struct {
+ ino_t ino;
+ int32_t mode;
+ inode_t *inode;
+ struct stat buf;
+ } mkdir;
+
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+ } unlink;
+
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+ } rmdir;
+
+ struct {
+ ino_t ino;
+ struct stat buf;
+ } rename;
+
+ struct {
+ ino_t ino;
+ inode_t *inode;
+ struct stat buf;
+ } link;
+
+ struct {
+ ino_t ino;
+ inode_t *inode;
+ struct stat buf;
+ char *linkpath;
+ } symlink;
+
+ struct {
+ int32_t flags;
+ dir_entry_t *entries;
+ int32_t count;
+ } setdents;
+ } cont; /* one sub-struct per fop, named after the fop it serves */
+
+ struct {
+ off_t start, len;
+
+ unsigned char *locked_nodes; /* priv->child_count entries, allocated by afr_transaction_local_init */
+ int lock_count;
+
+ const char *basename;
+ const char *new_basename;
+
+ char *pending;
+
+ loc_t parent_loc;
+ loc_t new_parent_loc;
+
+ afr_transaction_type type;
+
+ int success_count;
+ int erase_pending; /* AFR_LOCAL_INIT sets this to 1 */
+ int failure_count;
+
+ int last_tried;
+ int32_t *child_errno; /* priv->child_count entries, allocated by afr_transaction_local_init */
+
+ call_frame_t *main_frame;
+
+ int (*fop) (call_frame_t *frame, xlator_t *this); /* the four hooks below share the (frame, this) signature; their call order is set by the transaction code, not visible here */
+
+ int (*done) (call_frame_t *frame, xlator_t *this);
+
+ int (*resume) (call_frame_t *frame, xlator_t *this);
+
+ int (*unwind) (call_frame_t *frame, xlator_t *this);
+ } transaction;
+
+ afr_self_heal_t self_heal;
+} afr_local_t;
+
+/*
+ * Allocate one zeroed object of 'type' into 'var'; if the allocation
+ * fails, log an error, set op_errno to ENOMEM and jump to 'label'.
+ *
+ * The expansion uses 'this' (for this->name) and 'op_errno', so both
+ * must be in scope at the call site.
+ *
+ * The macro expands to a single statement without a trailing
+ * semicolon, so "ALLOC_OR_GOTO (...);" is safe inside an unbraced
+ * if/else.
+ */
+#define ALLOC_OR_GOTO(var, type, label) do { \
+                (var) = CALLOC (sizeof (type), 1); \
+                if (!(var)) { \
+                        gf_log (this->name, GF_LOG_ERROR, \
+                                "out of memory :("); \
+                        op_errno = ENOMEM; \
+                        goto label; \
+                } \
+        } while (0)
+
+
+/*
+ * did a call fail due to a child failing?
+ *
+ * True when op_ret is negative and op_errno is ENOTCONN or EBADFD.
+ * Both arguments are fully parenthesized in the expansion, so
+ * compound expressions can be passed safely; note that op_errno is
+ * still evaluated up to twice.
+ */
+#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
+                                           (((op_errno) == ENOTCONN) || \
+                                            ((op_errno) == EBADFD)))
+
+/* have we tried all children? true when index i equals count - 1, i.e. i is the last index of a count-sized range */
+#define all_tried(i, count) ((i) == (count) - 1)
+
+void
+afr_build_parent_loc (loc_t *parent, loc_t *child); /* declared only; presumably fills *parent from *child — TODO confirm */
+
+int
+afr_up_children_count (int child_count, unsigned char *child_up); /* AFR_LOCAL_INIT stores the result as the local's initial call_count */
+
+int
+afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); /* note: argument order is (array, count), the reverse of afr_up_children_count */
+
+int
+afr_first_up_child (afr_private_t *priv);
+
+ino64_t
+afr_itransform (ino64_t ino, int child_count, int child_index); /* NOTE(review): looks like the inverse of afr_deitransform — confirm */
+
+int
+afr_deitransform (ino64_t ino, int child_count);
+
+void
+afr_local_cleanup (afr_local_t *local, xlator_t *this); /* called by AFR_STACK_UNWIND / AFR_STACK_DESTROY right before the local itself is free()d */
+
+int
+afr_frame_return (call_frame_t *frame);
+
+/*
+ * Unwind 'frame' with 'params' and release the afr_local_t attached to it.
+ *
+ * Order of operations: save frame->local and frame->this, clear
+ * frame->local, run STACK_UNWIND, then pass the saved local to
+ * afr_local_cleanup() and free() it. The saved copies mean the frame
+ * is not touched again after STACK_UNWIND.
+ *
+ * 'frame' is evaluated more than once, so pass a plain variable.
+ * The macro expands to a single statement without a trailing
+ * semicolon, so "AFR_STACK_UNWIND (...);" is safe inside an unbraced
+ * if/else.
+ */
+#define AFR_STACK_UNWIND(frame, params ...) \
+        do { \
+                afr_local_t *__local = NULL; \
+                xlator_t *__this = NULL; \
+                __local = (frame)->local; \
+                __this = (frame)->this; \
+                (frame)->local = NULL; \
+                STACK_UNWIND (frame, params); \
+                afr_local_cleanup (__local, __this); \
+                free (__local); \
+        } while (0)
+
+/*
+ * Destroy the call stack that 'frame' belongs to and release the
+ * afr_local_t attached to it.
+ *
+ * Order of operations: save frame->local and frame->this, clear
+ * frame->local, run STACK_DESTROY on frame->root, then pass the saved
+ * local to afr_local_cleanup() and free() it. The saved copies mean
+ * the frame is not touched again after STACK_DESTROY.
+ *
+ * 'frame' is evaluated more than once, so pass a plain variable.
+ * The macro expands to a single statement without a trailing
+ * semicolon, so "AFR_STACK_DESTROY (...);" is safe inside an unbraced
+ * if/else.
+ */
+#define AFR_STACK_DESTROY(frame) \
+        do { \
+                afr_local_t *__local = NULL; \
+                xlator_t *__this = NULL; \
+                __local = (frame)->local; \
+                __this = (frame)->this; \
+                (frame)->local = NULL; \
+                STACK_DESTROY ((frame)->root); \
+                afr_local_cleanup (__local, __this); \
+                free (__local); \
+        } while (0)
+
+/*
+ * allocate and return a string that is the basename of argument
+ *
+ * Returns a newly allocated string that the caller owns, or NULL if
+ * 'str' is NULL or an allocation fails.
+ */
+static inline char *
+AFR_BASENAME (const char *str)
+{
+        char *__tmp_str = NULL;
+        char *__basename_str = NULL;
+
+        if (!str)
+                return NULL;
+
+        /* basename() may modify its argument, so hand it a scratch copy */
+        __tmp_str = strdup (str);
+        if (!__tmp_str)
+                return NULL;
+
+        /* the result may point into the scratch copy: duplicate it
+           before the scratch copy is released */
+        __basename_str = strdup (basename (__tmp_str));
+        FREE (__tmp_str);
+        return __basename_str;
+}
+
+/*
+ * Prepare 'local' for a new fop.
+ *
+ * Takes a private snapshot of priv->child_up, sets call_count to the
+ * number of children that snapshot reports as up, and seeds the
+ * result fields (op_ret = -1, op_errno = EUCLEAN) plus
+ * transaction.erase_pending = 1.
+ *
+ * Returns 0 on success, -ENOMEM if the snapshot cannot be allocated,
+ * or -ENOTCONN if no child is up. On either failure the result fields
+ * and erase_pending are left untouched; on -ENOTCONN the snapshot
+ * stays attached to 'local'.
+ */
+static inline int
+AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv)
+{
+        unsigned int nchildren = priv->child_count;
+
+        local->child_up = CALLOC (sizeof (*local->child_up), nchildren);
+        if (local->child_up == NULL)
+                return -ENOMEM;
+
+        memcpy (local->child_up, priv->child_up,
+                sizeof (*local->child_up) * nchildren);
+
+        local->call_count = afr_up_children_count (nchildren,
+                                                   local->child_up);
+        if (!local->call_count)
+                return -ENOTCONN;
+
+        local->op_ret = -1;
+        local->op_errno = EUCLEAN;
+        local->transaction.erase_pending = 1;
+
+        return 0;
+}
+
+
+/*
+ * Allocate the per-child arrays a transaction needs on 'local'.
+ *
+ * Four arrays of priv->child_count zeroed entries are created:
+ * local->child_errno, local->pending_array,
+ * local->transaction.locked_nodes and local->transaction.child_errno.
+ *
+ * Returns 0 when all four allocations succeed, or -ENOMEM as soon as
+ * one fails. Nothing is freed here on failure: arrays allocated
+ * before the failing one stay attached to 'local'.
+ */
+static inline int
+afr_transaction_local_init (afr_local_t *local, afr_private_t *priv)
+{
+        local->child_errno = CALLOC (sizeof (*local->child_errno),
+                                     priv->child_count);
+        if (!local->child_errno) {
+                return -ENOMEM;
+        }
+
+        local->pending_array = CALLOC (sizeof (*local->pending_array),
+                                       priv->child_count);
+        if (!local->pending_array) {
+                return -ENOMEM;
+        }
+
+        local->transaction.locked_nodes = CALLOC (sizeof (*local->transaction.locked_nodes),
+                                                  priv->child_count);
+        if (!local->transaction.locked_nodes) {
+                return -ENOMEM;
+        }
+
+        local->transaction.child_errno = CALLOC (sizeof (*local->transaction.child_errno),
+                                                 priv->child_count);
+        if (!local->transaction.child_errno) {
+                return -ENOMEM;
+        }
+
+        return 0;
+}
+
+#endif /* __AFR_H__ */