summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVikas Gorur <vikas@gluster.com>2009-11-24 08:45:09 +0000
committerAnand V. Avati <avati@dev.gluster.com>2009-11-24 06:40:08 -0800
commit74612a456ad1602f8038fae79fee654eb427602a (patch)
treea8c57ae1b5919688faa00985aad3677e0df9ea1b
parent218959e0597b16755a98b19786ed6a42cd15cbc4 (diff)
cluster/afr: Do self-heal on reopened fds.
This patch brings in partial support for self-heal of open fds. The precondition is that the fd should have been opened successfully during the initial open() (or create()), and we assume that protocol/client has successfully reopened the fd when the subvolume comes back up. It works by doing an "up/down flush" (a dummy flush transaction to do post-op wherever necessary) and then triggering data self-heal on the file in the post-post-op hook of the dummy flush transaction. This ensures that any writes that come in during self-heal will wait until self-heal completes. The up/down flush is also done when a subvolume goes down, so that post-op is done on all subvolumes where pre-op was done. Signed-off-by: Vikas Gorur <vikas@gluster.com> Signed-off-by: Anand V. Avati <avati@dev.gluster.com> BUG: 170 (Auto-heal fails on files that are open()-ed/mmap()-ed) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=170
-rw-r--r--xlators/cluster/afr/src/Makefile.am2
-rw-r--r--xlators/cluster/afr/src/afr-dir-read.c3
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c179
-rw-r--r--xlators/cluster/afr/src/afr-open.c356
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c49
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c2
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h3
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c130
-rw-r--r--xlators/cluster/afr/src/afr.c174
-rw-r--r--xlators/cluster/afr/src/afr.h30
10 files changed, 699 insertions, 229 deletions
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am
index df284d12c..1a8ddadb7 100644
--- a/xlators/cluster/afr/src/Makefile.am
+++ b/xlators/cluster/afr/src/Makefile.am
@@ -3,7 +3,7 @@ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
afr_la_LDFLAGS = -module -avoidversion
-afr_la_SOURCES = afr.c afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c afr-self-heal-algorithm.c
+afr_la_SOURCES = afr.c afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c afr-self-heal-algorithm.c
afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h afr-self-heal-algorithm.h
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index ed052589c..ee80963b7 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -165,7 +165,8 @@ out:
local->loc.path);
afr_self_heal (frame, this,
- afr_examine_dir_completion_cbk);
+ afr_examine_dir_completion_cbk,
+ _gf_true);
} else {
afr_set_opendir_done (this, local->fd->inode);
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 7dcc06708..5f35aa26f 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -205,32 +205,79 @@ afr_writev_done (call_frame_t *frame, xlator_t *this)
int
+afr_do_writev (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t * transaction_frame = NULL;
+ afr_local_t * local = NULL;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ local = frame->local;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Out of memory.");
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ transaction_frame->local = local;
+ frame->local = NULL;
+
+ local->op = GF_FOP_WRITE;
+
+ local->transaction.fop = afr_writev_wind;
+ local->transaction.done = afr_writev_done;
+ local->transaction.unwind = afr_writev_unwind;
+
+ local->transaction.main_frame = frame;
+ if (local->fd->flags & O_APPEND) {
+ local->transaction.start = 0;
+ local->transaction.len = 0;
+ } else {
+ local->transaction.start = local->cont.writev.offset;
+ local->transaction.len = iov_length (local->cont.writev.vector,
+ local->cont.writev.count);
+ }
+
+ afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL);
+ }
+
+ return 0;
+}
+
+
+int
afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count, off_t offset,
struct iobref *iobref)
{
afr_private_t * priv = NULL;
afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
int ret = -1;
int op_ret = -1;
int op_errno = 0;
+ uint64_t ctx;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (this->private, out);
priv = this->private;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
-
ALLOC_OR_GOTO (local, afr_local_t, out);
ret = AFR_LOCAL_INIT (local, priv);
@@ -239,37 +286,38 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
goto out;
}
- transaction_frame->local = local;
+ frame->local = local;
- local->op = GF_FOP_WRITE;
local->cont.writev.vector = iov_dup (vector, count);
local->cont.writev.count = count;
local->cont.writev.offset = offset;
local->cont.writev.ino = fd->inode->ino;
local->cont.writev.iobref = iobref_ref (iobref);
- local->transaction.fop = afr_writev_wind;
- local->transaction.done = afr_writev_done;
- local->transaction.unwind = afr_writev_unwind;
-
local->fd = fd_ref (fd);
- local->transaction.main_frame = frame;
- if (fd->flags & O_APPEND) {
- local->transaction.start = 0;
- local->transaction.len = 0;
- } else {
- local->transaction.start = offset;
- local->transaction.len = iov_length (vector, count);
- }
+ ret = fd_ctx_get (fd, this, &ctx);
+ if (ret < 0) {
+ goto out;
+ }
- afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+
+ if (fd_ctx->down_count < priv->down_count) {
+ local->up_down_flush_cbk = afr_do_writev;
+ afr_up_down_flush (frame, this, fd, AFR_CHILD_DOWN_FLUSH);
+
+ } else if (fd_ctx->up_count < priv->up_count) {
+ local->up_down_flush_cbk = afr_do_writev;
+ afr_up_down_flush (frame, this, fd, AFR_CHILD_UP_FLUSH);
+
+ } else {
+ afr_do_writev (frame, this);
+ }
op_ret = 0;
out:
if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL);
}
@@ -648,6 +696,52 @@ afr_ftruncate_done (call_frame_t *frame, xlator_t *this)
int
+afr_do_ftruncate (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t * transaction_frame = NULL;
+ afr_local_t * local = NULL;
+
+ int op_ret = -1;
+ int op_errno = 0;
+
+ local = frame->local;
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Out of memory.");
+ goto out;
+ }
+
+ transaction_frame->local = local;
+ frame->local = NULL;
+
+ local->op = GF_FOP_FTRUNCATE;
+
+ local->transaction.fop = afr_ftruncate_wind;
+ local->transaction.done = afr_ftruncate_done;
+ local->transaction.unwind = afr_ftruncate_unwind;
+
+ local->transaction.main_frame = frame;
+
+ local->transaction.start = 0;
+ local->transaction.len = local->cont.ftruncate.offset;
+
+ afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, NULL);
+ }
+
+ return 0;
+}
+
+
+int
afr_ftruncate (call_frame_t *frame, xlator_t *this,
fd_t *fd, off_t offset)
{
@@ -660,19 +754,15 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this,
int op_ret = -1;
int op_errno = 0;
+ uint64_t ctx;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (this->private, out);
priv = this->private;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
-
ALLOC_OR_GOTO (local, afr_local_t, out);
ret = AFR_LOCAL_INIT (local, priv);
@@ -681,25 +771,26 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this,
goto out;
}
- transaction_frame->local = local;
-
- local->op = GF_FOP_FTRUNCATE;
- local->op_ret = -1;
+ frame->local = local;
local->cont.ftruncate.offset = offset;
local->cont.ftruncate.ino = fd->inode->ino;
- local->transaction.fop = afr_ftruncate_wind;
- local->transaction.done = afr_ftruncate_done;
- local->transaction.unwind = afr_ftruncate_unwind;
-
local->fd = fd_ref (fd);
- local->transaction.main_frame = frame;
- local->transaction.start = 0;
- local->transaction.len = offset;
+ ret = fd_ctx_get (fd, this, &ctx);
+ if (ret < 0) {
+ goto out;
+ }
- afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+
+ if (fd_ctx->down_count < priv->down_count) {
+ local->up_down_flush_cbk = afr_do_ftruncate;
+ afr_up_down_flush (frame, this, fd, AFR_CHILD_DOWN_FLUSH);
+ } else {
+ afr_do_ftruncate (frame, this);
+ }
op_ret = 0;
out:
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
new file mode 100644
index 000000000..945f5cddf
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-open.c
@@ -0,0 +1,356 @@
+/*
+ Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+#include "byte-order.h"
+#include "statedump.h"
+
+#include "fd.h"
+
+#include "afr-inode-read.h"
+#include "afr-inode-write.h"
+#include "afr-dir-read.h"
+#include "afr-dir-write.h"
+#include "afr-transaction.h"
+
+#include "afr-self-heal.h"
+
+
+int
+afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *prebuf,
+ struct stat *postbuf)
+{
+ afr_local_t * local = frame->local;
+ int ret = 0;
+
+ ret = afr_fd_ctx_set (this, local->fd);
+
+ if (ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -ret;
+ }
+
+ AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno,
+ local->fd);
+ return 0;
+}
+
+
+int
+afr_open_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ fd_t *fd)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+
+ int ret = 0;
+
+ int call_count = -1;
+
+ priv = this->private;
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ }
+
+ if (op_ret >= 0) {
+ local->op_ret = op_ret;
+ local->success_count++;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ if ((local->cont.open.flags & O_TRUNC)
+ && (local->op_ret >= 0)) {
+ STACK_WIND (frame, afr_open_ftruncate_cbk,
+ this, this->fops->ftruncate,
+ fd, 0);
+ } else {
+ ret = afr_fd_ctx_set (this, fd);
+
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "could not set fd ctx for fd=%p",
+ fd);
+
+ local->op_ret = -1;
+ local->op_errno = -ret;
+ }
+
+ AFR_STACK_UNWIND (open, frame, local->op_ret,
+ local->op_errno, local->fd);
+ }
+ }
+
+ return 0;
+}
+
+
+int
+afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, int32_t wbflags)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+
+ int i = 0;
+ int ret = -1;
+
+ int32_t call_count = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ int32_t wind_flags = flags & (~O_TRUNC);
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+ VALIDATE_OR_GOTO (loc, out);
+
+ priv = this->private;
+
+ if (afr_is_split_brain (this, loc->inode)) {
+ /* self-heal failed */
+ op_errno = EIO;
+ goto out;
+ }
+
+ ALLOC_OR_GOTO (local, afr_local_t, out);
+
+ ret = AFR_LOCAL_INIT (local, priv);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ frame->local = local;
+ call_count = local->call_count;
+
+ local->cont.open.flags = flags;
+ local->fd = fd_ref (fd);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->open,
+ loc, wind_flags, fd, wbflags);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ op_ret = 0;
+out:
+ if (op_ret == -1) {
+ AFR_STACK_UNWIND (open, frame, op_ret, op_errno, fd);
+ }
+
+ return 0;
+}
+
+
+int
+afr_up_down_flush_sh_completion_cbk (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ local->transaction.post_post_op (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_up_down_flush_post_post_op (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ sh->calling_fop = GF_FOP_FLUSH;
+
+// sh->healing_fd = local->fd;
+
+// sh->healing_fd_opened = _gf_true;
+
+ local->cont.lookup.inode = local->fd->inode;
+
+ inode_path (local->fd->inode, NULL, (char **)&local->loc.path);
+ local->loc.name = strrchr (local->loc.path, '/');
+ local->loc.inode = inode_ref (local->fd->inode);
+ local->loc.parent = inode_parent (local->fd->inode, 0, NULL);
+
+ sh->data_lock_held = _gf_true;
+
+ local->need_data_self_heal = _gf_true;
+ local->cont.lookup.buf.st_mode = local->fd->inode->st_mode;
+ local->child_count = afr_up_children_count (priv->child_count,
+ local->child_up);
+
+ sh->flush_self_heal_cbk = afr_up_down_flush_sh_completion_cbk;
+
+ afr_self_heal (frame, this, afr_up_down_flush_sh_completion_cbk,
+ _gf_false);
+
+ return 0;
+}
+
+
+int
+afr_up_down_flush_wind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ local->transaction.resume (frame, this);
+ return 0;
+}
+
+
+int
+afr_up_down_flush_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ uint64_t ctx;
+ afr_fd_ctx_t * fd_ctx = NULL;
+
+ int _ret = -1;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ LOCK (&local->fd->lock);
+ {
+ _ret = __fd_ctx_get (local->fd, this, &ctx);
+
+ if (_ret < 0) {
+ goto out;
+ }
+
+ fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+
+ fd_ctx->down_count = priv->down_count;
+ fd_ctx->up_count = priv->up_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i])
+ fd_ctx->pre_op_done[i] = 0;
+ }
+ }
+out:
+ UNLOCK (&local->fd->lock);
+
+ local->up_down_flush_cbk (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ afr_flush_type type)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+
+ int op_ret = -1;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ local = frame->local;
+
+ local->op = GF_FOP_FLUSH;
+
+ local->fd = fd_ref (local->fd);
+
+ local->transaction.fop = afr_up_down_flush_wind;
+ local->transaction.done = afr_up_down_flush_done;
+
+ switch (type) {
+ case AFR_CHILD_UP_FLUSH:
+ local->transaction.post_post_op = afr_up_down_flush_post_post_op;
+ break;
+
+ case AFR_CHILD_DOWN_FLUSH:
+ local->transaction.post_post_op = NULL;
+ break;
+ }
+
+ local->transaction.start = 0;
+ local->transaction.len = 0;
+
+ gf_log (this->name, GF_LOG_TRACE,
+ "doing up/down flush on fd=%p",
+ fd);
+
+ afr_transaction (frame, this, AFR_FLUSH_TRANSACTION);
+
+ op_ret = 0;
+out:
+ return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index a60331d51..0b4f22b01 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1379,7 +1379,8 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this)
memcpy (lc, l, sizeof (afr_local_t));
- loc_copy (&lc->loc, &l->loc);
+ if (l->loc.path)
+ loc_copy (&lc->loc, &l->loc);
lc->child_up = memdup (l->child_up, priv->child_count);
if (l->xattr_req)
@@ -1412,12 +1413,16 @@ afr_bgsh_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this)
"background self-heal completed");
if (!sh->unwound) {
- AFR_STACK_UNWIND (lookup, sh->orig_frame,
- local->op_ret, local->op_errno,
- local->cont.lookup.inode,
- &local->cont.lookup.buf,
- local->cont.lookup.xattr,
- &local->cont.lookup.postparent);
+ if (sh->calling_fop == GF_FOP_LOOKUP) {
+ AFR_STACK_UNWIND (lookup, sh->orig_frame,
+ local->op_ret, local->op_errno,
+ local->cont.lookup.inode,
+ &local->cont.lookup.buf,
+ local->cont.lookup.xattr,
+ &local->cont.lookup.postparent);
+ } else {
+ sh->flush_self_heal_cbk (sh->orig_frame, this);
+ }
}
LOCK (&priv->lock);
@@ -1450,12 +1455,16 @@ afr_bgsh_unwind (call_frame_t *bgsh_frame, xlator_t *this)
sh->unwound = _gf_true;
- AFR_STACK_UNWIND (lookup, sh->orig_frame,
- local->op_ret, local->op_errno,
- local->cont.lookup.inode,
- &local->cont.lookup.buf,
- local->cont.lookup.xattr,
- &local->cont.lookup.postparent);
+ if (sh->calling_fop == GF_FOP_LOOKUP) {
+ AFR_STACK_UNWIND (lookup, sh->orig_frame,
+ local->op_ret, local->op_errno,
+ local->cont.lookup.inode,
+ &local->cont.lookup.buf,
+ local->cont.lookup.xattr,
+ &local->cont.lookup.postparent);
+ } else {
+ sh->flush_self_heal_cbk (sh->orig_frame, this);
+ }
return 0;
}
@@ -1463,7 +1472,8 @@ afr_bgsh_unwind (call_frame_t *bgsh_frame, xlator_t *this)
int
afr_self_heal (call_frame_t *frame, xlator_t *this,
- int (*completion_cbk) (call_frame_t *, xlator_t *))
+ int (*completion_cbk) (call_frame_t *, xlator_t *),
+ int bgsh)
{
afr_local_t *local = NULL;
afr_self_heal_t *sh = NULL;
@@ -1498,13 +1508,18 @@ afr_self_heal (call_frame_t *frame, xlator_t *this,
sh_frame->local = sh_local;
sh = &sh_local->self_heal;
- sh->background = _gf_true;
sh->orig_frame = frame;
- if (completion_cbk == NULL)
- sh->completion_cbk = afr_bgsh_completion_cbk;
+ if (bgsh)
+ sh->background = _gf_true;
else
+ sh->background = _gf_false;
+
+ if (completion_cbk == NULL) {
+ sh->completion_cbk = afr_bgsh_completion_cbk;
+ } else {
sh->completion_cbk = completion_cbk;
+ }
sh->unwind = afr_bgsh_unwind;
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 46d074831..e8384ec30 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -80,7 +80,7 @@ afr_sh_data_done (call_frame_t *frame, xlator_t *this)
"self heal of %s completed",
local->loc.path);
- sh->completion_cbk (frame, this);
+ sh->completion_cbk (sh->orig_frame, this);
return 0;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 7c4dd99b7..84a1380b7 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -47,6 +47,7 @@ afr_self_heal_metadata (call_frame_t *frame, xlator_t *this);
int
afr_self_heal (call_frame_t *frame, xlator_t *this,
- int (*completion_cbk) (call_frame_t *, xlator_t *));
+ int (*completion_cbk) (call_frame_t *, xlator_t *),
+ int bgsh);
#endif /* __AFR_SELF_HEAL_H__ */
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index f7604dbf6..94f0972a1 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -134,6 +134,34 @@ out:
static void
+__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index)
+{
+ afr_local_t *local = NULL;
+
+ uint64_t ctx;
+ afr_fd_ctx_t * fd_ctx = NULL;
+ int ret = 0;
+
+ local = frame->local;
+
+ ret = fd_ctx_get (local->fd, this, &ctx);
+
+ if (ret < 0)
+ goto out;
+
+ fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+
+ if ((local->op == GF_FOP_WRITE)
+ || (local->op == GF_FOP_FTRUNCATE)) {
+ fd_ctx->pre_op_done[child_index] = 1;
+ }
+
+out:
+ return;
+}
+
+
+static void
__mark_down_children (int32_t *pending[], int child_count,
unsigned char *child_up, afr_transaction_type type)
{
@@ -168,10 +196,15 @@ __is_first_write_on_fd (xlator_t *this, fd_t *fd)
{
int op_ret = 0;
int _ret = -1;
+ int i = 0;
uint64_t ctx;
afr_fd_ctx_t * fd_ctx = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
LOCK (&fd->lock);
{
_ret = __fd_ctx_get (fd, this, &ctx);
@@ -185,9 +218,12 @@ __is_first_write_on_fd (xlator_t *this, fd_t *fd)
fd_ctx = (afr_fd_ctx_t *)(long) ctx;
- if (fd_ctx->pre_op_done == 0) {
- fd_ctx->pre_op_done = 1;
- op_ret = 1;
+ op_ret = 1;
+ for (i = 0; i < priv->child_count; i++) {
+ if (fd_ctx->pre_op_done[i] == 0)
+ continue;
+
+ op_ret = 0;
}
}
out:
@@ -198,7 +234,7 @@ out:
static int
-__if_fd_pre_op_done (xlator_t *this, fd_t *fd)
+__if_fd_pre_op_done (xlator_t *this, fd_t *fd, int child_index)
{
int op_ret = 0;
int _ret = -1;
@@ -216,8 +252,7 @@ __if_fd_pre_op_done (xlator_t *this, fd_t *fd)
fd_ctx = (afr_fd_ctx_t *)(long) ctx;
- if (fd_ctx->pre_op_done) {
- fd_ctx->pre_op_done = 0;
+ if (fd_ctx->pre_op_done[child_index]) {
op_ret = 1;
}
}
@@ -229,6 +264,43 @@ out:
static int
+afr_pre_op_done_count (xlator_t *this, fd_t *fd, unsigned char *child_up)
+{
+ int i = 0;
+ int count = 0;
+
+ int _ret = 0;
+ uint64_t ctx;
+ afr_fd_ctx_t * fd_ctx = NULL;
+
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ LOCK (&fd->lock);
+ {
+ _ret = __fd_ctx_get (fd, this, &ctx);
+
+ if (_ret < 0) {
+ goto out;
+ }
+
+ fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (fd_ctx->pre_op_done[i] && child_up[i]) {
+ count++;
+ }
+ }
+ }
+out:
+ UNLOCK (&fd->lock);
+
+ return count;
+}
+
+
+static int
__changelog_enabled (afr_private_t *priv, afr_transaction_type type)
{
int ret = 0;
@@ -326,7 +398,7 @@ __changelog_needed_post_op (call_frame_t *frame, xlator_t *this)
break;
case GF_FOP_FLUSH:
- op_ret = __if_fd_pre_op_done (this, local->fd);
+ op_ret = 1;
break;
default:
@@ -665,11 +737,15 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
dict_ref (xattr[i]);
}
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ if (local->op == GF_FOP_FLUSH) {
+ call_count = afr_pre_op_done_count (this, local->fd, local->child_up);
+ } else {
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
- if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
- call_count *= 2;
- }
+ if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ call_count *= 2;
+ }
+ }
local->call_count = call_count;
@@ -696,20 +772,33 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
{
if (local->fd)
STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
+ priv->children[i],
priv->children[i]->fops->fxattrop,
- local->fd,
+ local->fd,
GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
+ else
STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
+ priv->children[i],
priv->children[i]->fops->xattrop,
- &local->loc,
+ &local->loc,
GF_XATTROP_ADD_ARRAY, xattr[i]);
+ call_count--;
+ }
+ break;
+
+ case AFR_FLUSH_TRANSACTION:
+ {
+ if (__if_fd_pre_op_done (this, local->fd, i)) {
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ call_count--;
+ }
}
break;
@@ -756,11 +845,12 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
priv->children[i]->fops->xattrop,
&local->transaction.parent_loc,
GF_XATTROP_ADD_ARRAY, xattr[i]);
+ call_count--;
}
break;
}
- if (!--call_count)
+ if (!call_count)
break;
}
}
@@ -789,6 +879,10 @@ afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
LOCK (&frame->lock);
{
+ if (op_ret == 0) {
+ __mark_pre_op_done_on_fd (frame, this, child_index);
+ }
+
if (op_ret == -1) {
local->child_up[child_index] = 0;
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index d6c1d8bcf..467b8112d 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -277,7 +277,7 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)
if (sh->locked_nodes)
FREE (sh->locked_nodes);
- if (sh->healing_fd) {
+ if (sh->healing_fd && !sh->healing_fd_opened) {
fd_unref (sh->healing_fd);
sh->healing_fd = NULL;
}
@@ -694,7 +694,10 @@ unlock:
lookup_buf->st_mode;
}
- afr_self_heal (frame, this, NULL);
+ local->self_heal.calling_fop = GF_FOP_LOOKUP;
+
+ afr_self_heal (frame, this, NULL, _gf_true);
+
} else {
AFR_STACK_UNWIND (lookup, frame, local->op_ret,
local->op_errno,
@@ -845,6 +848,15 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd)
goto unlock;
}
+ fd_ctx->pre_op_done = CALLOC (sizeof (*fd_ctx->pre_op_done),
+ priv->child_count);
+ if (!fd_ctx->pre_op_done) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Out of memory");
+ op_ret = -ENOMEM;
+ goto unlock;
+ }
+
fd_ctx->child_failed = CALLOC (sizeof (*fd_ctx->child_failed),
priv->child_count);
@@ -856,6 +868,9 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd)
goto unlock;
}
+ fd_ctx->up_count = priv->up_count;
+ fd_ctx->down_count = priv->down_count;
+
ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx);
if (ret < 0) {
op_ret = ret;
@@ -867,149 +882,6 @@ out:
return ret;
}
-
-int
-afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct stat *prebuf,
- struct stat *postbuf)
-{
- afr_local_t * local = frame->local;
- int ret = 0;
-
- ret = afr_fd_ctx_set (this, local->fd);
-
- if (ret < 0) {
- local->op_ret = -1;
- local->op_errno = -ret;
- }
-
- AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno,
- local->fd);
- return 0;
-}
-
-
-int
-afr_open_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- fd_t *fd)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int ret = 0;
-
- int call_count = -1;
-
- priv = this->private;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- }
-
- if (op_ret >= 0) {
- local->op_ret = op_ret;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if ((local->cont.open.flags & O_TRUNC)
- && (local->op_ret >= 0)) {
- STACK_WIND (frame, afr_open_ftruncate_cbk,
- this, this->fops->ftruncate,
- fd, 0);
- } else {
- ret = afr_fd_ctx_set (this, fd);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "could not set fd ctx for fd=%p",
- fd);
-
- local->op_ret = -1;
- local->op_errno = -ret;
- }
-
- AFR_STACK_UNWIND (open, frame, local->op_ret,
- local->op_errno, local->fd);
- }
- }
-
- return 0;
-}
-
-
-int
-afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- fd_t *fd, int32_t wbflags)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
-
- int i = 0;
- int ret = -1;
-
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int32_t wind_flags = flags & (~O_TRUNC);
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
-
- if (afr_is_split_brain (this, loc->inode)) {
- /* self-heal failed */
- op_errno = EIO;
- goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- frame->local = local;
- call_count = local->call_count;
-
- local->cont.open.flags = flags;
- local->fd = fd_ref (fd);
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->open,
- loc, wind_flags, fd, wbflags);
-
- if (!--call_count)
- break;
- }
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (open, frame, op_ret, op_errno, fd);
- }
-
- return 0;
-}
-
-/* }}} */
-
/* {{{ flush */
int
@@ -2385,6 +2257,12 @@ notify (xlator_t *this, int32_t event,
child_up[i] = 1;
+ LOCK (&priv->lock);
+ {
+ priv->up_count++;
+ }
+ UNLOCK (&priv->lock);
+
/*
if all the children were down, and one child came up,
send notify to parent
@@ -2408,6 +2286,12 @@ notify (xlator_t *this, int32_t event,
i = find_child_index (this, data);
child_up[i] = 0;
+
+ LOCK (&priv->lock);
+ {
+ priv->down_count++;
+ }
+ UNLOCK (&priv->lock);
/*
if all children are down, and this was the last to go down,
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 23e75e612..56f7a069d 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -69,6 +69,9 @@ typedef struct _afr_private {
unsigned int entry_lock_server_count;
unsigned int wait_count; /* # of servers to wait for success */
+
+ uint64_t up_count; /* number of CHILD_UPs we have seen */
+ uint64_t down_count; /* number of CHILD_DOWNs we have seen */
} afr_private_t;
typedef struct {
@@ -76,6 +79,8 @@ typedef struct {
directories? */
gf_boolean_t forced_merge;
+ glusterfs_fop_t calling_fop;
+
/* array of stat's, one for each child */
struct stat *buf;
struct stat parentbuf;
@@ -124,6 +129,8 @@ typedef struct {
gf_boolean_t data_lock_held; /* set by caller: true if caller
has already acquired 0-0 lock */
+ int (*flush_self_heal_cbk) (call_frame_t *frame, xlator_t *this);
+
int (*completion_cbk) (call_frame_t *frame, xlator_t *this);
int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this);
int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this);
@@ -168,6 +175,12 @@ afr_index_for_transaction_type (afr_transaction_type type)
}
+typedef enum {
+ AFR_CHILD_UP_FLUSH,
+ AFR_CHILD_DOWN_FLUSH,
+} afr_flush_type;
+
+
typedef struct _afr_local {
unsigned int call_count;
unsigned int success_count;
@@ -203,9 +216,12 @@ typedef struct _afr_local {
dict_t *xattr_req;
int open_fd_count;
+
int32_t inodelk_count;
int32_t entrylk_count;
+ int (*up_down_flush_cbk) (call_frame_t *, xlator_t *);
+
/*
This struct contains the arguments for the "continuation"
(scheme-like) of fops
@@ -503,8 +519,10 @@ typedef struct _afr_local {
typedef struct {
- unsigned char pre_op_done;
+ unsigned char *pre_op_done;
unsigned char *child_failed;
+ uint64_t up_count; /* number of CHILD_UPs this fd has seen */
+ uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */
} afr_fd_ctx_t;
@@ -560,9 +578,19 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this);
int
afr_frame_return (call_frame_t *frame);
+uint64_t
+afr_is_split_brain (xlator_t *this, inode_t *inode);
+
void
afr_set_split_brain (xlator_t *this, inode_t *inode);
+int
+afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, int32_t wbflags);
+
+int
+afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, afr_flush_type type);
+
void
afr_set_opendir_done (xlator_t *this, inode_t *inode);