diff options
Diffstat (limited to 'xlators/cluster/afr')
20 files changed, 14019 insertions, 0 deletions
diff --git a/xlators/cluster/afr/Makefile.am b/xlators/cluster/afr/Makefile.am new file mode 100644 index 00000000000..d471a3f9243 --- /dev/null +++ b/xlators/cluster/afr/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am new file mode 100644 index 00000000000..1bde9e5bad7 --- /dev/null +++ b/xlators/cluster/afr/src/Makefile.am @@ -0,0 +1,20 @@ +xlator_LTLIBRARIES = afr.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +afr_la_LDFLAGS = -module -avoidversion + +afr_la_SOURCES = afr.c afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c +afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/replicate.so + +install-data-hook: + ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so
\ No newline at end of file diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c new file mode 100644 index 00000000000..0c65ca8528d --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -0,0 +1,345 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" + + +int32_t +afr_opendir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + fd_t *fd) +{ + afr_local_t * local = NULL; + + int call_count = -1; + + LOCK (&frame->lock); + { + local = frame->local; + + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + AFR_STACK_UNWIND (frame, local->op_ret, + local->op_errno, local->fd); + } + + return 0; +} + + +int32_t +afr_opendir (call_frame_t *frame, xlator_t *this, + loc_t *loc, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int child_count = 0; + int i = 0; + + int ret = -1; + int call_count = -1; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + child_count = priv->child_count; + + ALLOC_OR_GOTO (local, afr_local_t, out); + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + local->fd = fd_ref (fd); + + call_count = local->call_count; + + for (i = 0; i < child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_opendir_cbk, + priv->children[i], + priv->children[i]->fops->opendir, + loc, fd); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, fd); + } + + return 0; +} + + +/** + * Common algorithm for directory read calls: + * + * - Try the fop on the first child that is up + * - if we have failed due to ENOTCONN: + * try the next child + * + * Applicable to: readdir + */ + +int32_t +afr_readdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + gf_dirent_t *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.readdir.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + + this_try = ++local->cont.readdir.last_tried; + unwind = 0; + + STACK_WIND (frame, afr_readdir_cbk, + children[this_try], + children[this_try]->fops->readdir, + local->fd, local->cont.readdir.size, + local->cont.readdir.offset); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); + } + + return 0; +} + + +int32_t +afr_readdir (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + + int ret = -1; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.readdir.last_tried = call_child; + + local->fd = fd_ref (fd); + local->cont.readdir.size = size; + local->cont.readdir.offset = offset; + + STACK_WIND (frame, afr_readdir_cbk, + children[call_child], children[call_child]->fops->readdir, + fd, size, offset); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +int32_t +afr_getdents_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dir_entry_t *entry, int32_t count) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.getdents.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + + this_try = ++local->cont.getdents.last_tried; + unwind = 0; + + STACK_WIND (frame, afr_getdents_cbk, + children[this_try], + children[this_try]->fops->getdents, + local->fd, local->cont.getdents.size, + local->cont.getdents.offset, local->cont.getdents.flag); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, entry, count); + } + + return 0; +} + + +int32_t +afr_getdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, int32_t flag) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.getdents.last_tried = call_child; + + local->fd = fd_ref (fd); + + local->cont.getdents.size = size; + local->cont.getdents.offset = offset; + local->cont.getdents.flag = flag; + + frame->local = local; + + STACK_WIND (frame, afr_getdents_cbk, + children[call_child], children[call_child]->fops->getdents, + fd, size, offset, flag); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h new file mode 100644 index 00000000000..172ec3c90c4 --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-read.h @@ -0,0 +1,47 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __DIR_READ_H__ +#define __DIR_READ_H__ + + +int32_t +afr_opendir (call_frame_t *frame, xlator_t *this, + loc_t *loc, fd_t *fd); + +int32_t +afr_closedir (call_frame_t *frame, xlator_t *this, + fd_t *fd); + +int32_t +afr_readdir (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset); + + +int32_t +afr_getdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, int32_t flag); + + +int32_t +afr_checksum (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags); + + +#endif /* __DIR_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c new file mode 100644 index 00000000000..87a6e09b5be --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -0,0 +1,1786 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" +#include "afr-transaction.h" + + +void +afr_build_parent_loc (loc_t *parent, loc_t *child) +{ + char *tmp = NULL; + + if (!child->parent) { + loc_copy (parent, child); + return; + } + + tmp = strdup (child->path); + parent->path = strdup (dirname (tmp)); + FREE (tmp); + + parent->name = strrchr (parent->path, '/'); + if (parent->name) + parent->name++; + + parent->inode = inode_ref (child->parent); + parent->parent = inode_parent (parent->inode, 0, NULL); + parent->ino = parent->inode->ino; +} + + +/* {{{ create */ + +int +afr_create_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.create.fd, + local->cont.create.inode, + &local->cont.create.buf); + return 0; +} + + +int +afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.create.buf = *buf; + local->cont.create.buf.st_ino = + afr_itransform (buf->st_ino, + priv->child_count, + child_index); + } + local->cont.create.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_create_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_create_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->create, + &local->loc, + local->cont.create.flags, + local->cont.create.mode, + local->cont.create.fd); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_create_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->cont.create.flags = flags; + local->cont.create.mode = mode; + local->cont.create.fd = fd_ref (fd); + + local->transaction.fop = afr_create_wind; + local->transaction.done = afr_create_done; + local->transaction.unwind = afr_create_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ mknod */ + +int +afr_mknod_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.mknod.inode, + &local->cont.mknod.buf); + return 0; +} + + +int +afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.mknod.buf = *buf; + local->cont.mknod.buf.st_ino = + afr_itransform (buf->st_ino, + priv->child_count, + child_index); + } + local->cont.mknod.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_mknod_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mknod, + &local->loc, local->cont.mknod.mode, + local->cont.mknod.dev); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_mknod_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_mknod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, dev_t dev) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->cont.mknod.mode = mode; + local->cont.mknod.dev = dev; + + local->transaction.fop = afr_mknod_wind; + local->transaction.done = afr_mknod_done; + local->transaction.unwind = afr_mknod_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ mkdir */ + + +int +afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.mkdir.inode, + &local->cont.mkdir.buf); + return 0; +} + + +int +afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.mkdir.buf = *buf; + local->cont.mkdir.buf.st_ino = + afr_itransform (buf->st_ino, priv->child_count, + child_index); + } + local->cont.mkdir.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_mkdir_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mkdir, + &local->loc, local->cont.mkdir.mode); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_mkdir_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->cont.mkdir.mode = mode; + + local->transaction.fop = afr_mkdir_wind; + local->transaction.done = afr_mkdir_done; + local->transaction.unwind = afr_mkdir_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ link */ + + +int +afr_link_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.link.buf.st_ino = local->cont.link.ino; + + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.link.inode, + &local->cont.link.buf); + } + + return 0; +} + + +int +afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.link.buf = *buf; + local->cont.link.buf.st_ino = + afr_itransform (buf->st_ino, priv->child_count, + child_index); + } + local->cont.link.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_link_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->link, + &local->loc, + &local->newloc); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_link_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, oldloc); + loc_copy (&local->newloc, newloc); + + local->cont.link.ino = oldloc->inode->ino; + + local->transaction.fop = afr_link_wind; + local->transaction.done = afr_link_done; + local->transaction.unwind = afr_link_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, oldloc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (oldloc->path); + local->transaction.new_basename = AFR_BASENAME (newloc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ + +/* {{{ symlink */ + + +int +afr_symlink_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.symlink.inode, + &local->cont.symlink.buf); + return 0; +} + + +int +afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.symlink.buf = *buf; + local->cont.symlink.buf.st_ino = + afr_itransform (buf->st_ino, priv->child_count, + child_index); + } + local->cont.symlink.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_symlink_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->symlink, + local->cont.symlink.linkpath, + &local->loc); + + if (!--call_count) + break; + + } + } + + return 0; +} + + +int +afr_symlink_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_symlink (call_frame_t *frame, xlator_t *this, + const char *linkpath, loc_t *loc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->cont.symlink.ino = loc->inode->ino; + local->cont.symlink.linkpath = strdup (linkpath); + + local->transaction.fop = afr_symlink_wind; + local->transaction.done = afr_symlink_done; + local->transaction.unwind = afr_symlink_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ rename */ + +int +afr_rename_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.rename.buf.st_ino = local->cont.rename.ino; + + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.rename.buf); + } + + return 0; +} + + +int +afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if ((op_ret != -1) && (local->success_count == 0)) { + local->op_ret = op_ret; + + if (buf) { + local->cont.rename.buf = *buf; + local->cont.rename.buf.st_ino = + afr_itransform (buf->st_ino, priv->child_count, + child_index); + } + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_rename_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->rename, + &local->loc, + &local->newloc); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_rename_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, oldloc); + loc_copy (&local->newloc, newloc); + + local->cont.rename.ino = oldloc->inode->ino; + + local->transaction.fop = afr_rename_wind; + local->transaction.done = afr_rename_done; + local->transaction.unwind = afr_rename_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, oldloc); + afr_build_parent_loc (&local->transaction.new_parent_loc, newloc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (oldloc->path); + local->transaction.new_basename = AFR_BASENAME (newloc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ unlink */ + +int +afr_unlink_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_unlink_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->unlink, + &local->loc); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int32_t +afr_unlink_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int32_t +afr_unlink (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->transaction.fop = afr_unlink_wind; + local->transaction.done = afr_unlink_done; + local->transaction.unwind = afr_unlink_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ + +/* {{{ rmdir */ + + + +int +afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + } + local->success_count++; + + if (local->success_count == priv->wait_count) + need_unwind = 1; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_rmdir_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->rmdir, + &local->loc); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_rmdir_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_rmdir (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->transaction.fop = afr_rmdir_wind; + local->transaction.done = afr_rmdir_done; + local->transaction.unwind = afr_rmdir_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ + +/* {{{ setdents */ + +int32_t +afr_setdents_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if ((op_ret != -1) && (local->success_count == 0)) { + local->op_ret = op_ret; + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_setdents_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_setdents_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setdents, + local->fd, local->cont.setdents.flags, + local->cont.setdents.entries, + local->cont.setdents.count); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int32_t +afr_setdents_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_setdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + + local->fd = fd_ref (fd); + + local->cont.setdents.flags = flags; + local->cont.setdents.entries = entries; + local->cont.setdents.count = count; + + local->transaction.fop = afr_setdents_wind; + local->transaction.done = afr_setdents_done; + + local->transaction.basename = NULL; + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h new file mode 100644 index 00000000000..e6e8a5e797c --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-write.h @@ -0,0 +1,59 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __DIR_WRITE_H__ +#define __DIR_WRITE_H__ + +int32_t +afr_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, fd_t *fd); + +int32_t +afr_mknod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, dev_t dev); + +int32_t +afr_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode); + +int32_t +afr_unlink (call_frame_t *frame, xlator_t *this, + loc_t *loc); + +int32_t +afr_rmdir (call_frame_t *frame, xlator_t *this, + loc_t *loc); + +int32_t +afr_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc); + +int32_t +afr_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc); + +int32_t +afr_symlink (call_frame_t *frame, xlator_t *this, + const char *linkpath, loc_t *oldloc); + +int32_t +afr_setdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count); + +#endif /* __DIR_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c new file mode 100644 index 00000000000..a6c99ec0576 --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -0,0 +1,721 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" + + +/** + * Common algorithm for inode read calls: + * + * - Try the fop on the first child that is up + * - if we have failed due to ENOTCONN: + * try the next child + * + * Applicable to: access, stat, fstat, readlink, getxattr + */ + +/* {{{ access */ + +int32_t +afr_access_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.access.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.access.last_tried; + + unwind = 0; + + STACK_WIND_COOKIE (frame, afr_access_cbk, + (void *) (long) this_try, + children[this_try], + children[this_try]->fops->access, + &local->loc, local->cont.access.mask); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + + +int32_t +afr_access (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t mask) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.access.last_tried = call_child; + loc_copy (&local->loc, loc); + local->cont.access.mask = mask; + + STACK_WIND_COOKIE (frame, afr_access_cbk, + (void *) (long) call_child, + children[call_child], children[call_child]->fops->access, + loc, mask); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +/* }}} */ + +/* {{{ stat */ + +int32_t +afr_stat_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int deitransform_child = -1; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + deitransform_child = (long) cookie; + + local = frame->local; + + if (op_ret == -1) { + retry: + last_tried = local->cont.stat.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.stat.last_tried; + + if (this_try == deitransform_child) { + goto retry; + } + + unwind = 0; + + STACK_WIND_COOKIE (frame, afr_stat_cbk, + (void *) (long) deitransform_child, + children[this_try], + children[this_try]->fops->stat, + &local->loc); + } + +out: + if (unwind) { + if (op_ret != -1) + buf->st_ino = local->cont.stat.ino; + + AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); + } + + return 0; +} + + +int32_t +afr_stat (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int call_child = 0; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + frame->local = local; + + call_child = afr_deitransform (loc->inode->ino, priv->child_count); + loc_copy (&local->loc, loc); + + /* + if stat fails from the deitranform'd child, we try + all children starting with the first one + */ + local->cont.stat.last_tried = -1; + local->cont.stat.ino = loc->inode->ino; + + STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, + children[call_child], + children[call_child]->fops->stat, + loc); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + +/* }}} */ + +/* {{{ fstat */ + +int32_t +afr_fstat_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int deitransform_child = -1; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + deitransform_child = (long) cookie; + + local = frame->local; + + if (op_ret == -1) { + retry: + last_tried = local->cont.fstat.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.fstat.last_tried; + + if (this_try == deitransform_child) { + /* + skip the deitransform'd child since if we are here + we must have already tried that child + */ + goto retry; + } + + + unwind = 0; + + STACK_WIND_COOKIE (frame, afr_fstat_cbk, + (void *) (long) deitransform_child, + children[this_try], + children[this_try]->fops->fstat, + local->fd); + } + +out: + if (unwind) { + if (op_ret != -1) + buf->st_ino = local->cont.fstat.ino; + + AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); + } + + return 0; +} + + +int32_t +afr_fstat (call_frame_t *frame, xlator_t *this, + fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int call_child = 0; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + frame->local = local; + + VALIDATE_OR_GOTO (fd->inode, out); + + call_child = afr_deitransform (fd->inode->ino, priv->child_count); + + /* + if fstat fails from the deitranform'd child, we try + all children starting with the first one + */ + local->cont.fstat.last_tried = -1; + local->cont.fstat.ino = fd->inode->ino; + local->fd = fd_ref (fd); + + STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, + children[call_child], + children[call_child]->fops->fstat, + fd); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ readlink */ + +int32_t +afr_readlink_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + const char *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.readlink.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.readlink.last_tried; + + unwind = 0; + STACK_WIND_COOKIE (frame, afr_readlink_cbk, + (void *) (long) this_try, + children[this_try], + children[this_try]->fops->readlink, + &local->loc, + local->cont.readlink.size); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); + } + + return 0; +} + + +int32_t +afr_readlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, size_t size) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + frame->local = local; + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.readlink.last_tried = call_child; + loc_copy (&local->loc, loc); + local->cont.readlink.size = size; + + STACK_WIND_COOKIE (frame, afr_readlink_cbk, + (void *) (long) call_child, + children[call_child], children[call_child]->fops->readlink, + loc, size); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +/* }}} */ + +/* {{{ getxattr */ + +int32_t +afr_getxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.getxattr.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.getxattr.last_tried; + + unwind = 0; + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, + (void *) (long) this_try, + children[this_try], + children[this_try]->fops->getxattr, + &local->loc, + local->cont.getxattr.name); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, dict); + } + + return 0; +} + + +int32_t +afr_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t * local = NULL; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + frame->local = local; + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.getxattr.last_tried = call_child; + loc_copy (&local->loc, loc); + if (name) + local->cont.getxattr.name = strdup (name); + + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, + (void *) (long) call_child, + children[call_child], children[call_child]->fops->getxattr, + loc, name); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +/* }}} */ + +/* {{{ readv */ + +/** + * read algorithm: + * + * if the user has specified a read subvolume, use it + * otherwise - + * use the inode number to hash it to one of the subvolumes, and + * read from there (to balance read load) + * + * if any of the above read's fail, try the children in sequence + * beginning at the beginning + */ + +int32_t +afr_readv_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + retry: + last_tried = local->cont.readv.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.readv.last_tried; + + if (this_try == priv->read_child) { + /* + skip the read child since if we are here + we must have already tried that child + */ + goto retry; + } + + unwind = 0; + + STACK_WIND_COOKIE (frame, afr_readv_cbk, + (void *) (long) this_try, + children[this_try], + children[this_try]->fops->readv, + local->fd, local->cont.readv.size, + local->cont.readv.offset); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, vector, count, buf); + } + + return 0; +} + + +int32_t +afr_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int call_child = 0; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (fd, out); + + priv = this->private; + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + frame->local = local; + + if (priv->read_child != -1) { + call_child = priv->read_child; + + /* + if read fails from the read child, we try + all children starting with the first one + */ + local->cont.readv.last_tried = -1; + } else { + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.readv.last_tried = call_child; + } + + local->fd = fd_ref (fd); + + local->cont.readv.size = size; + local->cont.readv.offset = offset; + + STACK_WIND_COOKIE (frame, afr_readv_cbk, + (void *) (long) call_child, + children[call_child], + children[call_child]->fops->readv, + fd, size, offset); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, 0, NULL); + } + return 0; +} + +/* }}} */ diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h new file mode 100644 index 00000000000..6b3bd2da850 --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-read.h @@ -0,0 +1,47 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __INODE_READ_H__ +#define __INODE_READ_H__ + +int32_t +afr_access (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t mask); + +int32_t +afr_stat (call_frame_t *frame, xlator_t *this, + loc_t *loc); + +int32_t +afr_fstat (call_frame_t *frame, xlator_t *this, + fd_t *fd); + +int32_t +afr_readlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, size_t size); + +int32_t +afr_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset); + +int32_t +afr_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name); + +#endif /* __INODE_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c new file mode 100644 index 00000000000..267350b2c4a --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -0,0 +1,2024 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" +#include "afr-transaction.h" + + +/* {{{ chmod */ + + +int +afr_chmod_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.chmod.buf.st_ino = local->cont.chmod.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.chmod.buf); + } + return 0; +} + + +int +afr_chmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.chmod.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + afr_chmod_unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_chmod_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int i = 0; + int call_count = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_chmod_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->chmod, + &local->loc, + local->cont.chmod.mode); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_chmod_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int32_t +afr_chmod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->cont.chmod.mode = mode; + local->cont.chmod.ino = loc->inode->ino; + + local->transaction.fop = afr_chmod_wind; + local->transaction.done = afr_chmod_done; + local->transaction.unwind = afr_chmod_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + + +/* {{{ fchmod */ + +int +afr_fchmod_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.fchmod.buf.st_ino = local->cont.fchmod.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.fchmod.buf); + } + return 0; +} + + +int +afr_fchmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.fchmod.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + afr_fchmod_unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_fchmod_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int i = 0; + int call_count = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_fchmod_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fchmod, + local->fd, + local->cont.fchmod.mode); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_fchmod_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int32_t +afr_fchmod (call_frame_t *frame, xlator_t *this, + fd_t *fd, mode_t mode) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->cont.fchmod.mode = mode; + local->cont.fchmod.ino = fd->inode->ino; + + local->transaction.fop = afr_fchmod_wind; + local->transaction.done = afr_fchmod_done; + local->transaction.unwind = afr_fchmod_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ chown */ + +int +afr_chown_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.chown.buf.st_ino = local->cont.chown.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.chown.buf); + } + return 0; +} + + +int +afr_chown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.chown.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) { + local->transaction.unwind (frame, this); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_chown_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_chown_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->chown, + &local->loc, local->cont.chown.uid, + local->cont.chown.gid); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_chown_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_chown (call_frame_t *frame, xlator_t *this, + loc_t *loc, uid_t uid, gid_t gid) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->cont.chown.uid = uid; + local->cont.chown.gid = gid; + local->cont.chown.ino = loc->inode->ino; + + local->transaction.fop = afr_chown_wind; + local->transaction.done = afr_chown_done; + local->transaction.unwind = afr_chown_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + +/* }}} */ + +/* {{{ chown */ + +int +afr_fchown_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.fchown.buf.st_ino = local->cont.fchown.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.fchown.buf); + } + return 0; +} + + +int +afr_fchown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.fchown.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) { + local->transaction.unwind (frame, this); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_fchown_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_fchown_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fchown, + local->fd, local->cont.fchown.uid, + local->cont.fchown.gid); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_fchown_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_fchown (call_frame_t *frame, xlator_t *this, + fd_t *fd, uid_t uid, gid_t gid) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->cont.fchown.uid = uid; + local->cont.fchown.gid = gid; + local->cont.fchown.ino = fd->inode->ino; + + local->transaction.fop = afr_fchown_wind; + local->transaction.done = afr_fchown_done; + local->transaction.unwind = afr_fchown_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ writev */ + +int +afr_writev_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.writev.buf.st_ino = local->cont.writev.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.writev.buf); + } + return 0; +} + + +int +afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.writev.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_writev_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int i = 0; + int call_count = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->writev, + local->fd, + local->cont.writev.vector, + local->cont.writev.count, + local->cont.writev.offset); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_writev_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (local->cont.writev.refs) + dict_unref (local->cont.writev.refs); + local->cont.writev.refs = NULL; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op = GF_FOP_WRITE; + local->cont.writev.vector = iov_dup (vector, count); + local->cont.writev.count = count; + local->cont.writev.offset = offset; + local->cont.writev.ino = fd->inode->ino; + + if (frame->root->req_refs) + local->cont.writev.refs = dict_ref (frame->root->req_refs); + + local->transaction.fop = afr_writev_wind; + local->transaction.done = afr_writev_done; + local->transaction.unwind = afr_writev_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + if (fd->flags & O_APPEND) { + local->transaction.start = 0; + local->transaction.len = 0; + } else { + local->transaction.start = offset; + local->transaction.len = iov_length (vector, count); + } + + local->transaction.pending = AFR_DATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + +/* }}} */ + +/* {{{ truncate */ + +int +afr_truncate_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.truncate.buf.st_ino = local->cont.truncate.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.truncate.buf); + } + return 0; +} + + +int +afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.truncate.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_truncate_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->truncate, + &local->loc, + local->cont.truncate.offset); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_truncate_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_truncate (call_frame_t *frame, xlator_t *this, + loc_t *loc, off_t offset) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.truncate.offset = offset; + local->cont.truncate.ino = loc->inode->ino; + + local->transaction.fop = afr_truncate_wind; + local->transaction.done = afr_truncate_done; + local->transaction.unwind = afr_truncate_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = offset; + local->transaction.pending = AFR_DATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + +/* }}} */ + +/* {{{ ftruncate */ + + +int +afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.ftruncate.buf.st_ino = local->cont.ftruncate.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.ftruncate.buf); + } + return 0; +} + + +int +afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.ftruncate.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->ftruncate, + local->fd, local->cont.ftruncate.offset); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_ftruncate_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op = GF_FOP_FTRUNCATE; + local->op_ret = -1; + + local->cont.ftruncate.offset = offset; + local->cont.ftruncate.ino = fd->inode->ino; + + local->transaction.fop = afr_ftruncate_wind; + local->transaction.done = afr_ftruncate_done; + local->transaction.unwind = afr_ftruncate_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = offset; + local->transaction.pending = AFR_DATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ utimens */ + + +int +afr_utimens_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.utimens.buf.st_ino = local->cont.utimens.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.utimens.buf); + } + return 0; +} + + +int +afr_utimens_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 1; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.utimens.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_utimens_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_utimens_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->utimens, + &local->loc, + local->cont.utimens.tv); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_utimens_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_utimens (call_frame_t *frame, xlator_t *this, + loc_t *loc, struct timespec tv[2]) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.utimens.tv[0] = tv[0]; + local->cont.utimens.tv[1] = tv[1]; + + local->cont.utimens.ino = loc->inode->ino; + + local->transaction.fop = afr_utimens_wind; + local->transaction.done = afr_utimens_done; + local->transaction.unwind = afr_utimens_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ setxattr */ + + +int +afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno) + } + return 0; +} + + +int +afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_setxattr_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setxattr, + &local->loc, + local->cont.setxattr.dict, + local->cont.setxattr.flags); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_setxattr_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_setxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *dict, int32_t flags) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.setxattr.dict = dict_ref (dict); + local->cont.setxattr.flags = flags; + + local->transaction.fop = afr_setxattr_wind; + local->transaction.done = afr_setxattr_done; + local->transaction.unwind = afr_setxattr_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ + +/* {{{ removexattr */ + + +int +afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno) + } + return 0; +} + + +int +afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_removexattr_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->removexattr, + &local->loc, + local->cont.removexattr.name); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_removexattr_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.removexattr.name = strdup (name); + + local->transaction.fop = afr_removexattr_wind; + local->transaction.done = afr_removexattr_done; + local->transaction.unwind = afr_removexattr_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h new file mode 100644 index 00000000000..9c0b5cad314 --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-write.h @@ -0,0 +1,63 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __INODE_WRITE_H__ +#define __INODE_WRITE_H__ + +int32_t +afr_chmod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode); + +int32_t +afr_chown (call_frame_t *frame, xlator_t *this, + loc_t *loc, uid_t uid, gid_t gid); + +int +afr_fchown (call_frame_t *frame, xlator_t *this, + fd_t *fd, uid_t uid, gid_t gid); + +int32_t +afr_fchmod (call_frame_t *frame, xlator_t *this, + fd_t *fd, mode_t mode); + +int32_t +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset); + +int32_t +afr_truncate (call_frame_t *frame, xlator_t *this, + loc_t *loc, off_t offset); + +int32_t +afr_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset); + +int32_t +afr_utimens (call_frame_t *frame, xlator_t *this, + loc_t *loc, struct timespec tv[2]); + +int32_t +afr_setxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *dict, int32_t flags); + +int32_t +afr_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name); + +#endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c new file mode 100644 index 00000000000..45d06516965 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -0,0 +1,1073 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include "glusterfs.h" +#include "xlator.h" +#include "byte-order.h" + +#include "afr.h" +#include "afr-transaction.h" +#include "afr-self-heal-common.h" +#include "afr-self-heal.h" + + +/** + * select_source - select a source and return it + * TODO: take into account option 'favorite-child' + */ + +int +afr_sh_select_source (int sources[], int child_count) +{ + int i; + for (i = 0; i < child_count; i++) + if (sources[i]) + return i; + + return -1; +} + + +/** + * sink_count - return number of sinks in sources array + */ + +int +afr_sh_sink_count (int sources[], int child_count) +{ + int i; + int sinks = 0; + for (i = 0; i < child_count; i++) + if (!sources[i]) + sinks++; + return sinks; +} + +int +afr_sh_source_count (int sources[], int child_count) +{ + int i; + int nsource = 0; + + for (i = 0; i < child_count; i++) + if (sources[i]) + nsource++; + return nsource; +} + + +int +afr_sh_supress_errenous_children (int sources[], int child_errno[], + int child_count) +{ + int i = 0; + + for (i = 0; i < child_count; i++) { + if (child_errno[i] && sources[i]) { + sources[i] = 0; + } + } + + return 0; +} + + +int +afr_sh_supress_empty_children (int sources[], dict_t *xattr[], + struct stat *buf, + int child_count, const char *key) +{ + int i = 0; + int32_t *pending = NULL; + int ret = 0; + int all_xattr_missing = 1; + + /* if the file was created by afr with xattrs */ + for (i = 0; i < child_count; i++) { + if (!xattr[i]) + continue; + + ret = dict_get_ptr (xattr[i], (char *)key, VOID(&pending)); + if (ret != 0) { + continue; + } + + all_xattr_missing = 0; + break; + } + + if (all_xattr_missing) { + /* supress 0byte files.. this avoids empty file created + by dir selfheal to overwrite the 'good' file */ + for (i = 0; i < child_count; i++) { + if (!buf[i].st_size) + sources[i] = 0; + } + goto out; + } + + + for (i = 0; i < child_count; i++) { + if (!xattr[i]) { + sources[i] = 0; + continue; + } + + ret = dict_get_ptr (xattr[i], (char *)key, VOID(&pending)); + if (ret != 0) { + sources[i] = 0; + continue; + } + + if (!pending) { + sources[i] = 0; + continue; + } + } + +out: + return 0; +} + + +void +afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) +{ + afr_private_t * priv = this->private; + + char *buf = NULL; + char *ptr = NULL; + + int i, j; + + /* 10 digits per entry + 1 space + '[' and ']' */ + buf = MALLOC (priv->child_count * 11 + 8); + + for (i = 0; i < priv->child_count; i++) { + ptr = buf; + ptr += sprintf (ptr, "[ "); + for (j = 0; j < priv->child_count; j++) { + ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); + } + ptr += sprintf (ptr, "]"); + gf_log (this->name, GF_LOG_DEBUG, + "pending_matrix: %s", buf); + } + + FREE (buf); +} + + +void +afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[], + int child_count, const char *key) +{ + int i = 0; + int j = 0; + int32_t *pending = NULL; + int ret = -1; + + /* start clean */ + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + pending_matrix[i][j] = 0; + } + } + + for (i = 0; i < child_count; i++) { + if (!xattr[i]) + continue; + + pending = NULL; + + ret = dict_get_ptr (xattr[i], (char *) key, + VOID(&pending)); + if (ret != 0) + continue; + + for (j = 0; j < child_count; j++) { + pending_matrix[i][j] = ntoh32 (pending[j]); + } + } +} + + +/** + * mark_sources: Mark all 'source' nodes and return number of source + * nodes found + */ + +int +afr_sh_mark_sources (int32_t *pending_matrix[], int sources[], int child_count) +{ + int i = 0; + int j = 0; + + int nsources = 0; + + + /* start clean */ + for (i = 0; i < child_count; i++) { + sources[i] = 0; + } + + /* + Let's 'normalize' the pending matrix first, + by disregarding all pending entries that refer + to themselves + */ + for (i = 0; i < child_count; i++) { + pending_matrix[i][i] = 0; + } + + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + if (pending_matrix[j][i]) + break; + } + + if (j == child_count) { + nsources++; + sources[i] = 1; + } + } + + return nsources; +} + + +void +afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[], + int success[], int child_count) +{ + int i = 0; + int j = 0; + + /* start clean */ + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + delta_matrix[i][j] = 0; + } + } + + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + if (!success[j]) + continue; + delta_matrix[i][j] = -pending_matrix[i][j]; + } + } +} + + +int +afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[], + int child_count, const char *key) +{ + int i = 0; + int j = 0; + + int ret = 0; + + int32_t *pending = 0; + + for (i = 0; i < child_count; i++) { + if (!xattr[i]) + continue; + + pending = CALLOC (sizeof (int32_t), child_count); + for (j = 0; j < child_count; j++) { + pending[j] = hton32 (delta_matrix[i][j]); + } + + ret = dict_set_bin (xattr[i], (char *) key, pending, + child_count * sizeof (int32_t)); + } + + return 0; +} + + +int +afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this) +{ + afr_private_t *priv = NULL; + int32_t *pending = NULL; + void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + + int ret = -1; + int i = 0; + + priv = this->private; + + ret = dict_get_ptr (xattr, AFR_METADATA_PENDING, &tmp_pending); + + if (ret != 0) + return 0; + + pending = tmp_pending; + for (i = 0; i < priv->child_count; i++) { + if (i == child_count) + continue; + if (pending[i]) + return 1; + } + + return 0; +} + + +int +afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this) +{ + afr_private_t *priv = NULL; + int32_t *pending = NULL; + void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + + int ret = -1; + int i = 0; + + priv = this->private; + + ret = dict_get_ptr (xattr, AFR_DATA_PENDING, &tmp_pending); + + if (ret != 0) + return 0; + + pending = tmp_pending; + for (i = 0; i < priv->child_count; i++) { + if (i == child_count) + continue; + if (pending[i]) + return 1; + } + + return 0; +} + + +int +afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this) +{ + afr_private_t *priv = NULL; + int32_t *pending = NULL; + void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + + int ret = -1; + int i = 0; + + priv = this->private; + + ret = dict_get_ptr (xattr, AFR_ENTRY_PENDING, &tmp_pending); + + if (ret != 0) + return 0; + + pending = tmp_pending; + for (i = 0; i < priv->child_count; i++) { + if (i == child_count) + continue; + if (pending[i]) + return 1; + } + + return 0; +} + + + +/** + * is_matrix_zero - return true if pending matrix is all zeroes + */ + +int +afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count) +{ + int i, j; + + for (i = 0; i < child_count; i++) + for (j = 0; j < child_count; j++) + if (pending_matrix[i][j]) + return 0; + return 1; +} + + +int +afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + +// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); + memset (sh->buf, 0, sizeof (struct stat) * priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) + dict_unref (sh->xattr[i]); + sh->xattr[i] = NULL; + } + + if (local->govinda_gOvinda) { + gf_log (this->name, GF_LOG_WARNING, + "aborting selfheal of %s", + local->loc.path); + sh->completion_cbk (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to metadata check on %s", + local->loc.path); + afr_self_heal_metadata (frame, this); + } + + return 0; +} + + +int +sh_missing_entries_unlck_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_missing_entries_done (frame, this); + } + + return 0; +} + + +static int +sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_self_heal_t *sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "unlocking %"PRId64"/%s on subvolume %s", + sh->parent_loc.inode->ino, local->loc.name, + priv->children[i]->name); + + STACK_WIND (frame, sh_missing_entries_unlck_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + &sh->parent_loc, local->loc.name, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + + if (!--call_count) + break; + } + } + return 0; +} + + +static int +sh_destroy_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int op_errno, struct stat *stbuf) +{ + STACK_DESTROY (frame->root); + return 0; +} + + +static int +sh_missing_entries_newentry_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *stbuf) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + call_frame_t *chown_frame = NULL; + int call_count = 0; + int child_index = 0; + struct stat *buf = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + buf = &sh->buf[sh->source]; + child_index = (long) cookie; + + if (op_ret == 0) { + chown_frame = copy_frame (frame); + + gf_log (this->name, GF_LOG_DEBUG, + "chown %s to %d %d on subvolume %s", + local->loc.path, buf->st_uid, buf->st_gid, + priv->children[child_index]->name); + + STACK_WIND (chown_frame, sh_destroy_cbk, + priv->children[child_index], + priv->children[child_index]->fops->chown, + &local->loc, + buf->st_uid, buf->st_gid); + } + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + sh_missing_entries_finish (frame, this); + } + + return 0; +} + + +static int +sh_missing_entries_mknod (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int enoent_count = 0; + int call_count = 0; + mode_t st_mode = 0; + dev_t st_dev = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) + if (sh->child_errno[i] == ENOENT) + enoent_count++; + + call_count = enoent_count; + local->call_count = call_count; + + st_mode = sh->buf[sh->source].st_mode; + st_dev = sh->buf[sh->source].st_dev; + + gf_log (this->name, GF_LOG_DEBUG, + "mknod %s mode 0%o on %d subvolumes", + local->loc.path, st_mode, enoent_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i] == ENOENT) { + STACK_WIND_COOKIE (frame, + sh_missing_entries_newentry_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mknod, + &local->loc, st_mode, st_dev); + if (!--call_count) + break; + } + } + + return 0; +} + + +static int +sh_missing_entries_mkdir (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int enoent_count = 0; + int call_count = 0; + mode_t st_mode = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) + if (sh->child_errno[i] == ENOENT) + enoent_count++; + + call_count = enoent_count; + local->call_count = call_count; + + st_mode = sh->buf[sh->source].st_mode; + + gf_log (this->name, GF_LOG_DEBUG, + "mkdir %s mode 0%o on %d subvolumes", + local->loc.path, st_mode, enoent_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i] == ENOENT) { + STACK_WIND_COOKIE (frame, + sh_missing_entries_newentry_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mkdir, + &local->loc, st_mode); + if (!--call_count) + break; + } + } + + return 0; +} + + +static int +sh_missing_entries_symlink (call_frame_t *frame, xlator_t *this, + const char *link) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int enoent_count = 0; + int call_count = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) + if (sh->child_errno[i] == ENOENT) + enoent_count++; + + call_count = enoent_count; + local->call_count = call_count; + + gf_log (this->name, GF_LOG_DEBUG, + "symlink %s -> %s on %d subvolumes", + local->loc.path, link, enoent_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i] == ENOENT) { + STACK_WIND_COOKIE (frame, + sh_missing_entries_newentry_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->symlink, + link, &local->loc); + if (!--call_count) + break; + } + } + + return 0; +} + + +static int +sh_missing_entries_readlink_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + const char *link) +{ + if (op_ret > 0) + sh_missing_entries_symlink (frame, this, link); + else + sh_missing_entries_finish (frame, this); + + return 0; +} + + +static int +sh_missing_entries_readlink (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + STACK_WIND (frame, sh_missing_entries_readlink_cbk, + priv->children[sh->source], + priv->children[sh->source]->fops->readlink, + &local->loc, 4096); + + return 0; +} + + +static int +sh_missing_entries_create (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int type = 0; + int i = 0; + afr_private_t *priv = NULL; + int enoent_count = 0; + int govinda_gOvinda = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i]) { + if (sh->child_errno[i] == ENOENT) + enoent_count++; + } else { + if (type) { + if (type != (sh->buf[i].st_mode & S_IFMT)) + govinda_gOvinda = 1; + } else { + sh->source = i; + type = sh->buf[i].st_mode & S_IFMT; + } + } + } + + if (govinda_gOvinda) { + gf_log (this->name, GF_LOG_ERROR, + "conflicing filetypes exist for path %s. returning.", + local->loc.path); + + local->govinda_gOvinda = 1; + sh_missing_entries_finish (frame, this); + return 0; + } + + if (!type) { + gf_log (this->name, GF_LOG_ERROR, + "no source found for %s. all nodes down?. returning.", + local->loc.path); + /* subvolumes down and/or file does not exist */ + sh_missing_entries_finish (frame, this); + return 0; + } + + if (enoent_count == 0) { + gf_log (this->name, GF_LOG_ERROR, + "no missing files - %s. proceeding to metadata check", + local->loc.path); + /* proceed to next step - metadata self-heal */ + sh_missing_entries_finish (frame, this); + return 0; + } + + switch (type) { + case S_IFSOCK: + case S_IFREG: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + sh_missing_entries_mknod (frame, this); + break; + case S_IFLNK: + sh_missing_entries_readlink (frame, this); + break; + case S_IFDIR: + sh_missing_entries_mkdir (frame, this); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "unknown file type: 0%o", type); + local->govinda_gOvinda = 1; + sh_missing_entries_finish (frame, this); + } + + return 0; +} + + +static int +sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + int child_index = 0; + afr_local_t *local = NULL; + int call_count = 0; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "path %s on subvolume %s is of mode 0%o", + local->loc.path, + priv->children[child_index]->name, + buf->st_mode); + + local->self_heal.buf[child_index] = *buf; + } else { + gf_log (this->name, GF_LOG_WARNING, + "path %s on subvolume %s => -1 (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + + local->self_heal.child_errno[child_index] = op_errno; + } + + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + sh_missing_entries_create (frame, this); + } + + return 0; +} + + +static int +sh_missing_entries_lookup (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + dict_t *xattr_req = NULL; + int ret = -1; + + local = frame->local; + call_count = local->child_count; + priv = this->private; + + local->call_count = call_count; + + xattr_req = dict_new(); + + if (xattr_req) + ret = dict_set_uint64 (xattr_req, AFR_ENTRY_PENDING, + priv->child_count * sizeof(int32_t)); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, + sh_missing_entries_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, xattr_req); + + if (!--call_count) + break; + } + } + + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + +static int +sh_missing_entries_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + sh->op_failed = 1; + + gf_log (this->name, + (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), + "locking inode of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed == 1) { + sh_missing_entries_finish (frame, this); + return 0; + } + + sh_missing_entries_lookup (frame, this); + } + + return 0; +} + + +static int +afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + gf_log (this->name, GF_LOG_DEBUG, + "attempting to recreate missing entries for path=%s", + local->loc.path); + + afr_build_parent_loc (&sh->parent_loc, &local->loc); + + call_count = local->child_count; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, sh_missing_entries_lk_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + &sh->parent_loc, local->loc.name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_self_heal (call_frame_t *frame, xlator_t *this, + int (*completion_cbk) (call_frame_t *, xlator_t *)) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + gf_log (this->name, GF_LOG_DEBUG, + "performing self heal on %s (metadata=%d data=%d entry=%d)", + local->loc.path, + local->need_metadata_self_heal, + local->need_data_self_heal, + local->need_entry_self_heal); + + sh->completion_cbk = completion_cbk; + + sh->buf = CALLOC (priv->child_count, sizeof (struct stat)); + sh->child_errno = CALLOC (priv->child_count, sizeof (int)); + sh->success = CALLOC (priv->child_count, sizeof (int)); + sh->xattr = CALLOC (priv->child_count, sizeof (dict_t *)); + sh->sources = CALLOC (sizeof (*sh->sources), priv->child_count); + + sh->pending_matrix = CALLOC (sizeof (int32_t *), priv->child_count); + for (i = 0; i < priv->child_count; i++) { + sh->pending_matrix[i] = CALLOC (sizeof (int32_t), + priv->child_count); + } + + sh->delta_matrix = CALLOC (sizeof (int32_t *), priv->child_count); + for (i = 0; i < priv->child_count; i++) { + sh->delta_matrix[i] = CALLOC (sizeof (int32_t), + priv->child_count); + } + + if (local->success_count && local->enoent_count) { + afr_self_heal_missing_entries (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to metadata check on %s", + local->loc.path); + afr_sh_missing_entries_done (frame, this); + } + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h new file mode 100644 index 00000000000..9dd597f0787 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -0,0 +1,66 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __AFR_SELF_HEAL_COMMON_H__ +#define __AFR_SELF_HEAL_COMMON_H__ + +#define FILE_HAS_HOLES(buf) (((buf)->st_size) > ((buf)->st_blocks * 512)) + +int +afr_sh_select_source (int sources[], int child_count); + +int +afr_sh_sink_count (int sources[], int child_count); + +int +afr_sh_source_count (int sources[], int child_count); + +int +afr_sh_supress_errenous_children (int sources[], int child_errno[], + int child_count); + +int +afr_sh_supress_empty_children (int sources[], dict_t *xattr[], + struct stat *buf, + int child_count, const char *key); + +void +afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); + +void +afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[], + int child_count, const char *key); + +void +afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[], + int32_t success[], int child_count); + +int +afr_sh_mark_sources (int32_t *pending_matrix[], int sources[], + int child_count); + +int +afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[], + int child_count, const char *key); + +int +afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count); + + +#endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c new file mode 100644 index 00000000000..3a48da48587 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -0,0 +1,1030 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + + + +int +afr_sh_data_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + /* + TODO: cleanup sh->* + */ + + gf_log (this->name, GF_LOG_DEBUG, + "self heal of %s completed", + local->loc.path); + + sh->completion_cbk (frame, this); + + return 0; +} + + +int +afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + fd_unref (sh->healing_fd); + sh->healing_fd = NULL; + afr_sh_data_done (frame, this); + } + + return 0; +} + + +int +afr_sh_data_close (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int i = 0; + int call_count = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + if (!sh->healing_fd) { + afr_sh_data_done (frame, this); + return 0; + } + + call_count = sh->active_sinks + 1; + local->call_count = call_count; + + + /* closed source */ + gf_log (this->name, GF_LOG_DEBUG, + "closing fd of %s on %s", + local->loc.path, priv->children[sh->source]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, + (void *) (long) sh->source, + priv->children[sh->source], + priv->children[sh->source]->fops->flush, + sh->healing_fd); + call_count--; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] || !local->child_up[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "closing fd of %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->flush, + sh->healing_fd); + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + int call_count = 0; + int child_index = (long) cookie; + + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "locking inode of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_data_close (frame, this); + } + + return 0; +} + + +int +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this) +{ + struct flock flock; + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t * sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + flock.l_start = 0; + flock.l_len = 0; + flock.l_type = F_UNLCK; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "unlocking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_unlck_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_sh_data_finish (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + gf_log (this->name, GF_LOG_DEBUG, + "finishing data selfheal of %s", local->loc.path); + + afr_sh_data_unlock (frame, this); + + return 0; +} + + +int +afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_data_finish (frame, this); + + return 0; +} + + +int +afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t **erase_xattr = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + + afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix, + sh->success, priv->child_count); + + erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + call_count++; + + erase_xattr[i] = get_new_dict(); + dict_ref (erase_xattr[i]); + } + } + + afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr, + priv->child_count, AFR_DATA_PENDING); + + local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (!erase_xattr[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "erasing pending flags from %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, erase_xattr[i]); + if (!--call_count) + break; + } + + for (i = 0; i < priv->child_count; i++) { + if (erase_xattr[i]) { + dict_unref (erase_xattr[i]); + } + } + FREE (erase_xattr); + + return 0; +} + + +int +afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) + gf_log (this->name, GF_LOG_ERROR, + "ftruncate of %s on subvolume %s failed (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + else + gf_log (this->name, GF_LOG_DEBUG, + "ftruncate of %s on subvolume %s completed", + local->loc.path, + priv->children[child_index]->name); + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_data_erase_pending (frame, this); + } + + return 0; +} + + +int +afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + int *sources = NULL; + int call_count = 0; + int i = 0; + + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + sources = sh->sources; + call_count = sh->active_sinks; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (sources[i] || !local->child_up[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->ftruncate, + sh->healing_fd, sh->file_size); + + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this); + +int +afr_sh_data_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + + int child_index = (long) cookie; + int call_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + gf_log (this->name, GF_LOG_DEBUG, + "wrote %d bytes of data from %s to child %d, offset %"PRId64"", + op_ret, local->loc.path, child_index, sh->offset - op_ret); + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "write to %s failed on subvolume %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->op_failed = 1; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_data_read_write_iter (frame, this); + } + + return 0; +} + + +int +afr_sh_data_read_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + + int child_index = (long) cookie; + int i = 0; + int call_count = 0; + + off_t offset; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + call_count = sh->active_sinks; + + local->call_count = call_count; + + gf_log (this->name, GF_LOG_DEBUG, + "read %d bytes of data from %s on child %d, offset %"PRId64"", + op_ret, local->loc.path, child_index, sh->offset); + + if (op_ret <= 0) { + afr_sh_data_trim_sinks (frame, this); + return 0; + } + + /* what if we read less than block size? */ + offset = sh->offset; + sh->offset += op_ret; + + frame->root->req_refs = frame->root->rsp_refs; + + if (sh->file_has_holes) { + if (iov_0filled (vector, count) == 0) { + /* the iter function depends on the + sh->offset already being updated + above + */ + afr_sh_data_read_write_iter (frame, this); + goto out; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] || !local->child_up[i]) + continue; + + /* this is a sink, so write to it */ + STACK_WIND_COOKIE (frame, afr_sh_data_write_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->writev, + sh->healing_fd, vector, count, offset); + + if (!--call_count) + break; + } + +out: + return 0; +} + + +int +afr_sh_data_read_write (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + STACK_WIND_COOKIE (frame, afr_sh_data_read_cbk, + (void *) (long) sh->source, + priv->children[sh->source], + priv->children[sh->source]->fops->readv, + sh->healing_fd, sh->block_size, + sh->offset); + + return 0; +} + + +int +afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + if (sh->op_failed) { + afr_sh_data_finish (frame, this); + goto out; + } + + if (sh->offset >= sh->file_size) { + gf_log (this->name, GF_LOG_DEBUG, + "closing fd's of %s", + local->loc.path); + afr_sh_data_trim_sinks (frame, this); + + goto out; + } + + afr_sh_data_read_write (frame, this); + +out: + return 0; +} + + +int +afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + /* TODO: some of the open's might fail. + In that case, modify cleanup fn to send flush on those + fd's which are already open */ + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "open of %s failed on child %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->op_failed = 1; + } + + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed) { + afr_sh_data_finish (frame, this); + return 0; + } + gf_log (this->name, GF_LOG_DEBUG, + "fd for %s opened, commencing sync", + local->loc.path); + + gf_log (this->name, GF_LOG_WARNING, + "sourcing file %s from %s to other sinks", + local->loc.path, priv->children[sh->source]->name); + + afr_sh_data_read_write (frame, this); + } + + return 0; +} + + +int +afr_sh_data_open (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + + int source = -1; + int *sources = NULL; + + fd_t *fd = NULL; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = sh->active_sinks + 1; + local->call_count = call_count; + + fd = fd_create (local->loc.inode, frame->root->pid); + sh->healing_fd = fd; + + source = local->self_heal.source; + sources = local->self_heal.sources; + + sh->block_size = 65536; + sh->file_size = sh->buf[source].st_size; + + if (FILE_HAS_HOLES (&sh->buf[source])) + sh->file_has_holes = 1; + + /* open source */ + STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, + (void *) (long) source, + priv->children[source], + priv->children[source]->fops->open, + &local->loc, O_RDONLY|O_LARGEFILE, fd); + call_count--; + + /* open sinks */ + for (i = 0; i < priv->child_count; i++) { + if(sources[i] || !local->child_up[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->open, + &local->loc, + O_WRONLY|O_LARGEFILE, fd); + + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int active_sinks = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] == 0 && local->child_up[i] == 1) { + active_sinks++; + sh->success[i] = 1; + } + } + sh->success[source] = 1; + + if (active_sinks == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sinks for performing self-heal on file %s", + local->loc.path); + afr_sh_data_finish (frame, this); + return 0; + } + sh->active_sinks = active_sinks; + + gf_log (this->name, GF_LOG_DEBUG, + "syncing data of %s from subvolume %s to %d active sinks", + local->loc.path, priv->children[source]->name, active_sinks); + + afr_sh_data_open (frame, this); + + return 0; +} + + +int +afr_sh_data_fix (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int nsources = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr, + priv->child_count, AFR_DATA_PENDING); + + afr_sh_print_pending_matrix (sh->pending_matrix, this); + + + afr_sh_mark_sources (sh->pending_matrix, sh->sources, + priv->child_count); + + afr_sh_supress_empty_children (sh->sources, sh->xattr, sh->buf, + priv->child_count, AFR_DATA_PENDING); + + afr_sh_supress_errenous_children (sh->sources, sh->child_errno, + priv->child_count); + + nsources = afr_sh_source_count (sh->sources, priv->child_count); + + if ((nsources == 0) + && (priv->favorite_child != -1) + && (sh->child_errno[priv->favorite_child] == 0)) { + + gf_log (this->name, GF_LOG_WARNING, + "Picking favorite child %s as authentic source to resolve conflicting data of %s", + priv->children[priv->favorite_child]->name, + local->loc.path); + + sh->sources[priv->favorite_child] = 1; + + nsources = afr_sh_source_count (sh->sources, + priv->child_count); + } + + if (nsources == 0) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to resolve conflicting data of %s. " + "Please resolve manually by deleting the file %s " + "from all but the preferred subvolume. " + "Please consider 'option favorite-child <>'", + local->loc.path, local->loc.path); + + local->govinda_gOvinda = 1; + + afr_sh_data_finish (frame, this); + return 0; + } + + source = afr_sh_select_source (sh->sources, priv->child_count); + sh->source = source; + + /* detect changes not visible through pending flags -- JIC */ + for (i = 0; i < priv->child_count; i++) { + if (i == source || sh->child_errno[i]) + continue; + + if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source])) + sh->sources[i] = 0; + } + + afr_sh_data_sync_prepare (frame, this); + + return 0; +} + + +int +afr_sh_data_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + int call_count = -1; + int child_index = (long) cookie; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + sh->xattr[child_index] = dict_ref (xattr); + sh->buf[child_index] = *buf; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_data_fix (frame, this); + } + + return 0; +} + + +int +afr_sh_data_lookup (call_frame_t *frame, xlator_t *this) +{ + afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xattr_req = NULL; + + int call_count = 0; + int i = 0; + int ret = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + call_count = local->child_count; + + local->call_count = call_count; + + xattr_req = dict_new(); + if (xattr_req) + ret = dict_set_uint64 (xattr_req, AFR_DATA_PENDING, + priv->child_count * sizeof(int32_t)); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_sh_data_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, xattr_req); + if (!--call_count) + break; + } + } + + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + +int +afr_sh_data_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + /* TODO: what if lock fails? */ + + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + sh->op_failed = 1; + + gf_log (this->name, + (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), + "locking of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed) { + afr_sh_data_finish (frame, this); + return 0; + } + + afr_sh_data_lookup (frame, this); + } + + return 0; +} + + +int +afr_sh_data_lock (call_frame_t *frame, xlator_t *this) +{ + struct flock flock; + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t * sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + flock.l_start = 0; + flock.l_len = 0; + flock.l_type = F_WRLCK; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "locking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_lock_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_self_heal_data (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = this->private; + + + local = frame->local; + sh = &local->self_heal; + + if (local->need_data_self_heal && priv->data_self_heal) { + afr_sh_data_lock (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "not doing data self heal on %s", + local->loc.path); + afr_sh_data_done (frame, this); + } + + return 0; +} + diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c new file mode 100644 index 00000000000..ec341922ee7 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -0,0 +1,2038 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + + + +int +afr_sh_entry_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + /* + TODO: cleanup sh->* + */ + + gf_log (this->name, GF_LOG_DEBUG, + "self heal of %s completed", + local->loc.path); + + sh->completion_cbk (frame, this); + + return 0; +} + + +int +afr_sh_entry_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + /* TODO: what if lock fails? */ + + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "unlocking inode of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "unlocked inode of %s on child %d", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->healing_fd) + fd_unref (sh->healing_fd); + sh->healing_fd = NULL; + afr_sh_entry_done (frame, this); + } + + return 0; +} + + +int +afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t * sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "unlocking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_entry_unlck_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->entrylk, + &local->loc, NULL, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_sh_entry_finish (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + gf_log (this->name, GF_LOG_DEBUG, + "finishing entry selfheal of %s", local->loc.path); + + afr_sh_entry_unlock (frame, this); + + return 0; +} + + +int +afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_entry_finish (frame, this); + + return 0; +} + + +int +afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t **erase_xattr = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + + afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix, + sh->success, priv->child_count); + + erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + call_count++; + + erase_xattr[i] = get_new_dict(); + dict_ref (erase_xattr[i]); + } + } + + afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr, + priv->child_count, AFR_ENTRY_PENDING); + + local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (!erase_xattr[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "erasing pending flags from %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, erase_xattr[i]); + if (!--call_count) + break; + } + + for (i = 0; i < priv->child_count; i++) { + if (erase_xattr[i]) { + dict_unref (erase_xattr[i]); + } + } + FREE (erase_xattr); + + return 0; +} + + + +static int +next_active_source (call_frame_t *frame, xlator_t *this, + int current_active_source) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int source = -1; + int next_active_source = -1; + int i = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + source = sh->source; + + if (source != -1) { + if (current_active_source != source) + next_active_source = source; + goto out; + } + + /* + the next active sink becomes the source for the + 'conservative decision' of merging all entries + */ + + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) + && (local->child_up[i] == 1) + && (i > current_active_source)) { + + next_active_source = i; + break; + } + } +out: + return next_active_source; +} + + + +static int +next_active_sink (call_frame_t *frame, xlator_t *this, + int current_active_sink) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int next_active_sink = -1; + int i = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + /* + the next active sink becomes the source for the + 'conservative decision' of merging all entries + */ + + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) + && (local->child_up[i] == 1) + && (i > current_active_sink)) { + + next_active_sink = i; + break; + } + } + + return next_active_sink; +} + + +int +build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +{ + int ret = -1; + + if (!child) { + goto out; + } + + if (strcmp (parent->path, "/") == 0) + asprintf ((char **)&child->path, "/%s", name); + else + asprintf ((char **)&child->path, "%s/%s", parent->path, name); + + if (!child->path) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + child->name = strrchr (child->path, '/'); + if (child->name) + child->name++; + + child->parent = inode_ref (parent->inode); + child->inode = inode_new (parent->inode->table); + + if (!child->inode) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ret = 0; +out: + if (ret == -1) + loc_wipe (child); + + return ret; +} + + +int +afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this); + +int +afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, + int active_src); + +int +afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_entry_expunge_subvol (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int active_src = 0; + call_frame_t *frame = NULL; + + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + + active_src = (long) cookie; + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "removed %s on %s", + expunge_local->loc.path, + priv->children[active_src]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "removing %s on %s failed (%s)", + expunge_local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + } + + AFR_STACK_DESTROY (expunge_frame); + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + + priv = this->private; + expunge_local = expunge_frame->local; + + gf_log (this->name, GF_LOG_WARNING, + "removing directory %s on %s", + expunge_local->loc.path, priv->children[active_src]->name); + + STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->rmdir, + &expunge_local->loc); + + return 0; +} + + +int +afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + + priv = this->private; + expunge_local = expunge_frame->local; + + gf_log (this->name, GF_LOG_WARNING, + "unlinking file %s on %s", + expunge_local->loc.path, priv->children[active_src]->name); + + STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->unlink, + &expunge_local->loc); + + return 0; +} + + +int +afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, + int active_src, struct stat *buf) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int source = 0; + call_frame_t *frame = NULL; + int type = 0; + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + source = expunge_sh->source; + + type = (buf->st_mode & S_IFMT); + + switch (type) { + case S_IFSOCK: + case S_IFREG: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + case S_IFLNK: + afr_sh_entry_expunge_unlink (expunge_frame, this, active_src); + + break; + case S_IFDIR: + afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "%s has unknown file type on %s: 0%o", + expunge_local->loc.path, + priv->children[source]->name, type); + goto out; + break; + } + + return 0; +out: + AFR_STACK_DESTROY (expunge_frame); + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *x) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + call_frame_t *frame = NULL; + int active_src = 0; + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + active_src = (long) cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "lookup of %s on %s failed (%s)", + expunge_local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + goto out; + } + + afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf); + + return 0; +out: + AFR_STACK_DESTROY (expunge_frame); + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + + priv = this->private; + expunge_local = expunge_frame->local; + + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on %s", + expunge_local->loc.path, priv->children[active_src]->name); + + STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->lookup, + &expunge_local->loc, 0); + + return 0; +} + + +int +afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *x) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int source = 0; + call_frame_t *frame = NULL; + int active_src = 0; + + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + active_src = expunge_sh->active_source; + source = (long) cookie; + + if (op_ret == -1 && op_errno == ENOENT) { + + gf_log (this->name, GF_LOG_DEBUG, + "missing entry %s on %s", + expunge_local->loc.path, + priv->children[source]->name); + + afr_sh_entry_expunge_purge (expunge_frame, this, active_src); + + return 0; + } + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "%s exists under %s", + expunge_local->loc.path, + priv->children[source]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "looking up %s under %s failed (%s)", + expunge_local->loc.path, + priv->children[source]->name, + strerror (op_errno)); + } + + AFR_STACK_DESTROY (expunge_frame); + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, + char *name) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int ret = -1; + call_frame_t *expunge_frame = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int active_src = 0; + int source = 0; + int op_errno = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + source = sh->source; + + if ((strcmp (name, ".") == 0) + || (strcmp (name, "..") == 0)) { + gf_log (this->name, GF_LOG_DEBUG, + "skipping inspection of %s under %s", + name, local->loc.path); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "inspecting existance of %s under %s", + name, local->loc.path); + + expunge_frame = copy_frame (frame); + if (!expunge_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (expunge_local, afr_local_t, out); + + expunge_frame->local = expunge_local; + expunge_sh = &expunge_local->self_heal; + expunge_sh->sh_frame = frame; + expunge_sh->active_source = active_src; + + ret = build_child_loc (this, &expunge_local->loc, &local->loc, name); + if (ret != 0) { + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on %s", expunge_local->loc.path, + priv->children[source]->name); + + STACK_WIND_COOKIE (expunge_frame, + afr_sh_entry_expunge_entry_cbk, + (void *) (long) source, + priv->children[source], + priv->children[source]->fops->lookup, + &expunge_local->loc, 0); + + ret = 0; +out: + if (ret == -1) + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + gf_dirent_t *entries) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + gf_dirent_t *entry = NULL; + off_t last_offset = 0; + int active_src = 0; + int entry_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + + if (op_ret <= 0) { + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "readdir of %s on subvolume %s failed (%s)", + local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "readdir of %s on subvolume %s complete", + local->loc.path, + priv->children[active_src]->name); + } + + afr_sh_entry_expunge_all (frame, this); + return 0; + } + + list_for_each_entry (entry, &entries->list, list) { + last_offset = entry->d_off; + entry_count++; + } + + gf_log (this->name, GF_LOG_DEBUG, + "readdir'ed %d entries from %s", + entry_count, priv->children[active_src]->name); + + sh->offset = last_offset; + local->call_count = entry_count; + + list_for_each_entry (entry, &entries->list, list) { + afr_sh_entry_expunge_entry (frame, this, entry->d_name); + } + + return 0; +} + +int +afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, + priv->children[active_src], + priv->children[active_src]->fops->readdir, + sh->healing_fd, sh->block_size, sh->offset); + + return 0; +} + + +int +afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int active_src = -1; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + sh->offset = 0; + + if (sh->source == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sources for %s to expunge entries", + local->loc.path); + goto out; + } + + active_src = next_active_sink (frame, this, sh->active_source); + sh->active_source = active_src; + + if (sh->op_failed) { + goto out; + } + + if (active_src == -1) { + /* completed creating missing files on all subvolumes */ + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "expunging entries of %s on %s to other sinks", + local->loc.path, priv->children[active_src]->name); + + afr_sh_entry_expunge_subvol (frame, this, active_src); + + return 0; +out: + afr_sh_entry_erase_pending (frame, this); + return 0; + +} + + +int +afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); + +int +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, + int active_src); + +int +afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_entry_impunge_subvol (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_impunge_utimens_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct stat *stbuf) +{ + int call_count = 0; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + int active_src = 0; + int child_index = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + child_index = (long) cookie; + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "utimes set for %s on %s", + impunge_local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "setting utimes of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + } + + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_chown_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct stat *stbuf) +{ + int call_count = 0; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + int active_src = 0; + int child_index = 0; + struct timespec ts[2]; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + child_index = (long) cookie; + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "ownership of %s on %s changed", + impunge_local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "setting ownership of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + goto out; + } + +#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC + ts[0] = impunge_local->cont.lookup.buf.st_atim; + ts[1] = impunge_local->cont.lookup.buf.st_mtim; +#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC + ts[0] = impunge_local->cont.lookup.buf.st_atimespec; + ts[1] = impunge_local->cont.lookup.buf.st_mtimespec; +#else + ts[0].tv_sec = impunge_local->cont.lookup.buf.st_atime; + ts[1].tv_sec = impunge_local->cont.lookup.buf.st_mtime; +#endif + STACK_WIND_COOKIE (impunge_frame, + afr_sh_entry_impunge_utimens_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->utimens, + &impunge_local->loc, ts); + + return 0; + +out: + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *stbuf) +{ + int call_count = 0; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + int active_src = 0; + int child_index = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + + child_index = (long) cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "creation of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "setting ownership of %s on %s to %d/%d", + impunge_local->loc.path, + priv->children[child_index]->name, + impunge_local->cont.lookup.buf.st_uid, + impunge_local->cont.lookup.buf.st_gid); + + inode->st_mode = stbuf->st_mode; + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_chown_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->chown, + &impunge_local->loc, + impunge_local->cont.lookup.buf.st_uid, + impunge_local->cont.lookup.buf.st_gid); + return 0; + +out: + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, + int child_index, struct stat *stbuf) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + gf_log (this->name, GF_LOG_WARNING, + "creating file %s mode=0%o dev=0x%"GF_PRI_DEV" on %s", + impunge_local->loc.path, + stbuf->st_mode, stbuf->st_rdev, + priv->children[child_index]->name); + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->mknod, + &impunge_local->loc, + stbuf->st_mode, stbuf->st_rdev); + + return 0; +} + + + +int +afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, + int child_index, struct stat *stbuf) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + gf_log (this->name, GF_LOG_WARNING, + "creating directory %s mode=0%o on %s", + impunge_local->loc.path, + stbuf->st_mode, + priv->children[child_index]->name); + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->mkdir, + &impunge_local->loc, stbuf->st_mode); + + return 0; +} + + +int +afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, + int child_index, const char *linkname) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + gf_log (this->name, GF_LOG_WARNING, + "creating symlink %s -> %s on %s", + impunge_local->loc.path, linkname, + priv->children[child_index]->name); + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->symlink, + linkname, &impunge_local->loc); + + return 0; +} + + +int +afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + const char *linkname) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int child_index = -1; + call_frame_t *frame = NULL; + int call_count = -1; + int active_src = -1; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + active_src = impunge_sh->active_source; + + child_index = (long) cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "readlink of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + goto out; + } + + afr_sh_entry_impunge_symlink (impunge_frame, this, child_index, + linkname); + return 0; + +out: + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this, + int child_index, struct stat *stbuf) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = -1; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + active_src = impunge_sh->active_source; + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, + (void *) (long) child_index, + priv->children[active_src], + priv->children[active_src]->fops->readlink, + &impunge_local->loc, 4096); + + return 0; +} + + +int +afr_sh_entry_impunge_recreate_lookup_cbk (call_frame_t *impunge_frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, + dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = 0; + int type = 0; + int child_index = 0; + call_frame_t *frame = NULL; + int call_count = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + + child_index = (long) cookie; + + active_src = impunge_sh->active_source; + + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "looking up %s on %s (for %s) failed (%s)", + impunge_local->loc.path, + priv->children[active_src]->name, + priv->children[child_index]->name, + strerror (op_errno)); + goto out; + } + + impunge_local->cont.lookup.buf = *buf; + type = (buf->st_mode & S_IFMT); + + switch (type) { + case S_IFSOCK: + case S_IFREG: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + afr_sh_entry_impunge_mknod (impunge_frame, this, + child_index, buf); + break; + case S_IFLNK: + afr_sh_entry_impunge_readlink (impunge_frame, this, + child_index, buf); + break; + case S_IFDIR: + afr_sh_entry_impunge_mkdir (impunge_frame, this, + child_index, buf); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "%s has unknown file type on %s: 0%o", + impunge_local->loc.path, + priv->children[active_src]->name, type); + goto out; + break; + } + + return 0; + +out: + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_recreate (call_frame_t *impunge_frame, xlator_t *this, + int child_index) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = 0; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + active_src = impunge_sh->active_source; + + STACK_WIND_COOKIE (impunge_frame, + afr_sh_entry_impunge_recreate_lookup_cbk, + (void *) (long) child_index, + priv->children[active_src], + priv->children[active_src]->fops->lookup, + &impunge_local->loc, 0); + + return 0; +} + + +int +afr_sh_entry_impunge_entry_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *x) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int call_count = 0; + int child_index = 0; + call_frame_t *frame = NULL; + int active_src = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + child_index = (long) cookie; + active_src = impunge_sh->active_source; + + if (op_ret == -1 && op_errno == ENOENT) { + /* decrease call_count in recreate-callback */ + gf_log (this->name, GF_LOG_DEBUG, + "missing entry %s on %s", + impunge_local->loc.path, + priv->children[child_index]->name); + + afr_sh_entry_impunge_recreate (impunge_frame, this, + child_index); + return 0; + } + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "%s exists under %s", + impunge_local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "looking up %s under %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + } + + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, + char *name) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int ret = -1; + call_frame_t *impunge_frame = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = 0; + int i = 0; + int call_count = 0; + int op_errno = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + + if ((strcmp (name, ".") == 0) + || (strcmp (name, "..") == 0)) { + gf_log (this->name, GF_LOG_DEBUG, + "skipping inspection of %s under %s", + name, local->loc.path); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "inspecting existance of %s under %s", + name, local->loc.path); + + impunge_frame = copy_frame (frame); + if (!impunge_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (impunge_local, afr_local_t, out); + + impunge_frame->local = impunge_local; + impunge_sh = &impunge_local->self_heal; + impunge_sh->sh_frame = frame; + impunge_sh->active_source = active_src; + + ret = build_child_loc (this, &impunge_local->loc, &local->loc, name); + if (ret != 0) { + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (i == active_src) + continue; + if (local->child_up[i] == 0) + continue; + if (sh->sources[i] == 1) + continue; + call_count++; + } + + impunge_local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (i == active_src) + continue; + if (local->child_up[i] == 0) + continue; + if (sh->sources[i] == 1) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on %s", impunge_local->loc.path, + priv->children[i]->name); + + STACK_WIND_COOKIE (impunge_frame, + afr_sh_entry_impunge_entry_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &impunge_local->loc, 0); + + if (!--call_count) + break; + } + + ret = 0; +out: + if (ret == -1) + afr_sh_entry_impunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + gf_dirent_t *entries) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + gf_dirent_t *entry = NULL; + off_t last_offset = 0; + int active_src = 0; + int entry_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + + if (op_ret <= 0) { + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "readdir of %s on subvolume %s failed (%s)", + local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "readdir of %s on subvolume %s complete", + local->loc.path, + priv->children[active_src]->name); + } + + afr_sh_entry_impunge_all (frame, this); + return 0; + } + + list_for_each_entry (entry, &entries->list, list) { + last_offset = entry->d_off; + entry_count++; + } + + gf_log (this->name, GF_LOG_DEBUG, + "readdir'ed %d entries from %s", + entry_count, priv->children[active_src]->name); + + sh->offset = last_offset; + local->call_count = entry_count; + + list_for_each_entry (entry, &entries->list, list) { + afr_sh_entry_impunge_entry (frame, this, entry->d_name); + } + + return 0; +} + + +int +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk, + priv->children[active_src], + priv->children[active_src]->fops->readdir, + sh->healing_fd, sh->block_size, sh->offset); + + return 0; +} + + +int +afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int active_src = -1; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + sh->offset = 0; + + active_src = next_active_source (frame, this, sh->active_source); + sh->active_source = active_src; + + if (sh->op_failed) { + afr_sh_entry_finish (frame, this); + return 0; + } + + if (active_src == -1) { + /* completed creating missing files on all subvolumes */ + afr_sh_entry_expunge_all (frame, this); + return 0; + } + + gf_log (this->name, GF_LOG_DEBUG, + "impunging entries of %s on %s to other sinks", + local->loc.path, priv->children[active_src]->name); + + afr_sh_entry_impunge_subvol (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + /* TODO: some of the open's might fail. + In that case, modify cleanup fn to send flush on those + fd's which are already open */ + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "opendir of %s failed on child %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->op_failed = 1; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed) { + afr_sh_entry_finish (frame, this); + return 0; + } + gf_log (this->name, GF_LOG_DEBUG, + "fd for %s opened, commencing sync", + local->loc.path); + + sh->active_source = -1; + afr_sh_entry_impunge_all (frame, this); + } + + return 0; +} + + +int +afr_sh_entry_open (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + + int source = -1; + int *sources = NULL; + + fd_t *fd = NULL; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = local->self_heal.source; + sources = local->self_heal.sources; + + sh->block_size = 131072; + sh->offset = 0; + + call_count = sh->active_sinks; + if (source != -1) + call_count++; + + local->call_count = call_count; + + fd = fd_create (local->loc.inode, frame->root->pid); + sh->healing_fd = fd; + + if (source != -1) { + gf_log (this->name, GF_LOG_DEBUG, + "opening directory %s on subvolume %s (source)", + local->loc.path, priv->children[source]->name); + + /* open source */ + STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, + (void *) (long) source, + priv->children[source], + priv->children[source]->fops->opendir, + &local->loc, fd); + call_count--; + } + + /* open sinks */ + for (i = 0; i < priv->child_count; i++) { + if (sources[i] || !local->child_up[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "opening directory %s on subvolume %s (sink)", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->opendir, + &local->loc, fd); + + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int active_sinks = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] == 0 && local->child_up[i] == 1) { + active_sinks++; + sh->success[i] = 1; + } + } + if (source != -1) + sh->success[source] = 1; + + if (active_sinks == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sinks for self-heal on dir %s", + local->loc.path); + afr_sh_entry_finish (frame, this); + return 0; + } + if (source == -1 && active_sinks < 2) { + gf_log (this->name, GF_LOG_WARNING, + "cannot sync with 0 sources and 1 sink on dir %s", + local->loc.path); + afr_sh_entry_finish (frame, this); + return 0; + } + sh->active_sinks = active_sinks; + + if (source != -1) + gf_log (this->name, GF_LOG_DEBUG, + "syncing %s from subvolume %s to %d active sinks", + local->loc.path, priv->children[source]->name, + active_sinks); + else + gf_log (this->name, GF_LOG_DEBUG, + "no active sources for %s found. " + "merging all entries as a conservative decision", + local->loc.path); + + afr_sh_entry_open (frame, this); + + return 0; +} + + +int +afr_sh_entry_fix (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int source = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr, + priv->child_count, AFR_ENTRY_PENDING); + + afr_sh_print_pending_matrix (sh->pending_matrix, this); + + + afr_sh_mark_sources (sh->pending_matrix, sh->sources, + priv->child_count); + + afr_sh_supress_errenous_children (sh->sources, sh->child_errno, + priv->child_count); + + source = afr_sh_select_source (sh->sources, priv->child_count); + sh->source = source; + + afr_sh_entry_sync_prepare (frame, this); + + return 0; +} + + + +int +afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + int call_count = -1; + int child_index = (long) cookie; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + sh->xattr[child_index] = dict_ref (xattr); + sh->buf[child_index] = *buf; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_entry_fix (frame, this); + } + + return 0; +} + + + +int +afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this) +{ + afr_self_heal_t * sh = NULL; + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + dict_t *xattr_req = NULL; + int ret = 0; + int call_count = 0; + int i = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + call_count = local->child_count; + + local->call_count = call_count; + + xattr_req = dict_new(); + if (xattr_req) + ret = dict_set_uint64 (xattr_req, AFR_ENTRY_PENDING, + priv->child_count * sizeof(int32_t)); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, + afr_sh_entry_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, xattr_req); + if (!--call_count) + break; + } + } + + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + + +int +afr_sh_entry_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + /* TODO: what if lock fails? */ + + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + sh->op_failed = 1; + + gf_log (this->name, + (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), + "locking inode of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed == 1) { + afr_sh_entry_finish (frame, this); + return 0; + } + + afr_sh_entry_lookup (frame, this); + } + + return 0; +} + + +int +afr_sh_entry_lock (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t * sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "locking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_entry_lock_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->entrylk, + &local->loc, NULL, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_self_heal_entry (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + if (local->need_entry_self_heal && priv->entry_self_heal) { + afr_sh_entry_lock (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to completion on %s", + local->loc.path); + afr_sh_entry_done (frame, this); + } + + return 0; +} + diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c new file mode 100644 index 00000000000..e65a426db6c --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -0,0 +1,791 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + + +int +afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + +// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); + memset (sh->buf, 0, sizeof (struct stat) * priv->child_count); + memset (sh->success, 0, sizeof (int) * priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) + dict_unref (sh->xattr[i]); + sh->xattr[i] = NULL; + } + + if (local->govinda_gOvinda) { + gf_log (this->name, GF_LOG_WARNING, + "aborting selfheal of %s", + local->loc.path); + sh->completion_cbk (frame, this); + } else { + if (S_ISREG (local->cont.lookup.buf.st_mode)) { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to data check on %s", + local->loc.path); + afr_self_heal_data (frame, this); + return 0; + } + + if (S_ISDIR (local->cont.lookup.buf.st_mode)) { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to entry check on %s", + local->loc.path); + afr_self_heal_entry (frame, this); + return 0; + } + gf_log (this->name, GF_LOG_DEBUG, + "completed self heal of %s", + local->loc.path); + + sh->completion_cbk (frame, this); + } + + return 0; +} + + +int +afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + int call_count = 0; + + + local = frame->local; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_metadata_done (frame, this); + + return 0; +} + + +int +afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + struct flock flock = {0, }; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + flock.l_start = 0; + flock.l_len = 0; + flock.l_type = F_UNLCK; + + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "unlocking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND (frame, afr_sh_metadata_unlck_cbk, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_metadata_finish (frame, this); + + return 0; +} + + +int +afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t **erase_xattr = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + + afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix, + sh->success, priv->child_count); + + erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + call_count++; + + erase_xattr[i] = get_new_dict(); + dict_ref (erase_xattr[i]); + } + } + + afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr, + priv->child_count, AFR_METADATA_PENDING); + + local->call_count = call_count; + + if (call_count == 0) { + gf_log (this->name, GF_LOG_WARNING, + "metadata of %s not healed on any subvolume", + local->loc.path); + + afr_sh_metadata_finish (frame, this); + } + + for (i = 0; i < priv->child_count; i++) { + if (!erase_xattr[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "erasing pending flags from %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, erase_xattr[i]); + if (!--call_count) + break; + } + + for (i = 0; i < priv->child_count; i++) { + if (erase_xattr[i]) { + dict_unref (erase_xattr[i]); + } + } + FREE (erase_xattr); + + return 0; +} + + +int +afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "setting attributes failed for %s on %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + + sh->success[child_index] = 0; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_metadata_erase_pending (frame, this); + + return 0; +} + + +int +afr_sh_metadata_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); + + return 0; +} + + +int +afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); + + return 0; +} + + +int +afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int source = 0; + int active_sinks = 0; + int call_count = 0; + int i = 0; + struct timespec ts[2]; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + active_sinks = sh->active_sinks; + + /* + * 4 calls per sink - chown, chmod, utimes, setxattr + */ + if (xattr) + call_count = active_sinks * 4; + else + call_count = active_sinks * 3; + + local->call_count = call_count; + +#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC + ts[0] = sh->buf[source].st_atim; + ts[1] = sh->buf[source].st_mtim; +#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC + ts[0] = sh->buf[source].st_atimespec; + ts[1] = sh->buf[source].st_mtimespec; +#else + ts[0].tv_sec = sh->buf[source].st_atime; + ts[1].tv_sec = sh->buf[source].st_mtime; +#endif + + for (i = 0; i < priv->child_count; i++) { + if (call_count == 0) { + break; + } + if (sh->sources[i] || !local->child_up[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "syncing metadata of %s from %s to %s", + local->loc.path, priv->children[source]->name, + priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->chown, + &local->loc, + sh->buf[source].st_uid, + sh->buf[source].st_gid); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->chmod, + &local->loc, sh->buf[source].st_mode); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->utimens, + &local->loc, ts); + + call_count = call_count - 3; + + if (!xattr) + continue; + + STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setxattr, + &local->loc, xattr, 0); + call_count--; + } + + return 0; +} + + +int +afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int source = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "getxattr of %s failed on subvolume %s (%s). proceeding without xattr", + local->loc.path, priv->children[source]->name, + strerror (op_errno)); + + afr_sh_metadata_sync (frame, this, NULL); + } else { + dict_del (xattr, AFR_DATA_PENDING); + dict_del (xattr, AFR_METADATA_PENDING); + dict_del (xattr, AFR_ENTRY_PENDING); + afr_sh_metadata_sync (frame, this, xattr); + } + + return 0; +} + + +int +afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int active_sinks = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] == 0 && local->child_up[i] == 1) { + active_sinks++; + sh->success[i] = 1; + } + } + sh->success[source] = 1; + + if (active_sinks == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sinks for performing self-heal on file %s", + local->loc.path); + afr_sh_metadata_finish (frame, this); + return 0; + } + sh->active_sinks = active_sinks; + + gf_log (this->name, GF_LOG_DEBUG, + "syncing metadata of %s from subvolume %s to %d active sinks", + local->loc.path, priv->children[source]->name, active_sinks); + + STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, + priv->children[source], + priv->children[source]->fops->getxattr, + &local->loc, NULL); + + return 0; +} + + +int +afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int nsources = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr, + priv->child_count, AFR_METADATA_PENDING); + + afr_sh_print_pending_matrix (sh->pending_matrix, this); + + afr_sh_mark_sources (sh->pending_matrix, sh->sources, + priv->child_count); + + afr_sh_supress_errenous_children (sh->sources, sh->child_errno, + priv->child_count); + + nsources = afr_sh_source_count (sh->sources, priv->child_count); + + if ((nsources == 0) + && (priv->favorite_child != -1) + && (sh->child_errno[priv->favorite_child] == 0)) { + + gf_log (this->name, GF_LOG_WARNING, + "Picking favorite child %s as authentic source to resolve conflicting metadata of %s", + priv->children[priv->favorite_child]->name, + local->loc.path); + + sh->sources[priv->favorite_child] = 1; + + nsources = afr_sh_source_count (sh->sources, + priv->child_count); + } + + if (nsources == 0) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to resolve conflicting metadata of %s. " + "Please resolve manually by fixing the " + "permissions/ownership of %s on your subvolumes. " + "You can also consider 'option favorite-child <>'", + local->loc.path, local->loc.path); + + local->govinda_gOvinda = 1; + + afr_sh_metadata_finish (frame, this); + return 0; + } + + source = afr_sh_select_source (sh->sources, priv->child_count); + sh->source = source; + + /* detect changes not visible through pending flags -- JIC */ + for (i = 0; i < priv->child_count; i++) { + if (i == source || sh->child_errno[i]) + continue; + + if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source])) + sh->sources[i] = 0; + + if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source])) + sh->sources[i] = 0; + } + + afr_sh_metadata_sync_prepare (frame, this); + + return 0; +} + + +int +afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "path %s on subvolume %s is of mode 0%o", + local->loc.path, + priv->children[child_index]->name, + buf->st_mode); + + sh->buf[child_index] = *buf; + if (xattr) + sh->xattr[child_index] = dict_ref (xattr); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "path %s on subvolume %s => -1 (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + + sh->child_errno[child_index] = op_errno; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_metadata_fix (frame, this); + + return 0; +} + + +int +afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + dict_t *xattr_req = NULL; + int ret = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + local->call_count = call_count; + + xattr_req = dict_new(); + + if (xattr_req) + ret = dict_set_uint64 (xattr_req, AFR_METADATA_PENDING, + priv->child_count * sizeof(int32_t)); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, xattr_req); + if (!--call_count) + break; + } + } + + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + +int +afr_sh_metadata_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = (long) cookie; + + /* TODO: what if lock fails? */ + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + sh->op_failed = 1; + + gf_log (this->name, + (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), + "locking of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed) { + afr_sh_metadata_finish (frame, this); + return 0; + } + + afr_sh_metadata_lookup (frame, this); + } + + return 0; +} + + +int +afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + struct flock flock = {0, }; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + flock.l_start = 0; + flock.l_len = 0; + flock.l_type = F_WRLCK; + + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "locking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_lk_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = this->private; + + + local = frame->local; + sh = &local->self_heal; + + if (local->need_metadata_self_heal && priv->metadata_self_heal) { + afr_sh_metadata_lock (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to data check on %s", + local->loc.path); + afr_sh_metadata_done (frame, this); + } + + return 0; +} + diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h new file mode 100644 index 00000000000..1c97a9bc11b --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -0,0 +1,52 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __AFR_SELF_HEAL_H__ +#define __AFR_SELF_HEAL_H__ + +#include <sys/stat.h> + +#define FILETYPE_DIFFERS(buf1,buf2) ((S_IFMT & ((struct stat *)buf1)->st_mode) != (S_IFMT & ((struct stat *)buf2)->st_mode)) +#define PERMISSION_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_mode) != (((struct stat *)buf2)->st_mode)) +#define OWNERSHIP_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_uid) != (((struct stat *)buf2)->st_uid) || (((struct stat *)buf1)->st_gid != (((struct stat *)buf2)->st_gid))) +#define SIZE_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_size) != (((struct stat *)buf2)->st_size)) + + + +int +afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this); +int +afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this); +int +afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this); + +int +afr_self_heal_entry (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal_data (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal_metadata (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal (call_frame_t *frame, xlator_t *this, + int (*completion_cbk) (call_frame_t *, xlator_t *)); + +#endif /* __AFR_SELF_HEAL_H__ */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c new file mode 100644 index 00000000000..3df9f07e5a3 --- /dev/null +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -0,0 +1,957 @@ +/* + Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include "dict.h" +#include "byte-order.h" + +#include "afr.h" +#include "afr-transaction.h" + +#include <signal.h> + + +static void +__mark_all_pending (int32_t *pending, int child_count) +{ + int i; + + for (i = 0; i < child_count; i++) + pending[i] = hton32 (1); +} + + +static void +__mark_child_dead (int32_t *pending, int child_count, int child) +{ + pending[child] = 0; +} + + +static void +__mark_down_children (int32_t *pending, int child_count, unsigned char *child_up) +{ + int i; + + for (i = 0; i < child_count; i++) + if (!child_up[i]) + pending[i] = 0; +} + + +static void +__mark_all_success (int32_t *pending, int child_count) +{ + int i; + + for (i = 0; i < child_count; i++) + pending[i] = hton32 (-1); +} + + +static int +__is_first_write_on_fd (xlator_t *this, fd_t *fd) +{ + int op_ret = 0; + int _ret = -1; + + _ret = fd_ctx_get (fd, this, NULL); + if (_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "first writev() on fd=%p, writing changelog", + fd); + + _ret = fd_ctx_set (fd, this, 0xaf1); + op_ret = 1; + } + + return op_ret; +} + + +static int +__changelog_enabled (afr_private_t *priv, afr_transaction_type type) +{ + int ret = 0; + + switch (type) { + case AFR_DATA_TRANSACTION: + if (priv->data_change_log) + ret = 1; + + break; + + case AFR_METADATA_TRANSACTION: + if (priv->metadata_change_log) + ret = 1; + + break; + + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + if (priv->entry_change_log) + ret = 1; + + break; + + case AFR_FLUSH_TRANSACTION: + ret = 1; + } + + return ret; +} + + +static int +__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + fd_t * fd = NULL; + + int op_ret = 0; + + priv = this->private; + local = frame->local; + + if (__changelog_enabled (priv, local->transaction.type)) { + switch (local->op) { + + case GF_FOP_WRITE: + case GF_FOP_FTRUNCATE: + /* + if it's a data transaction, we write the changelog + only on the first write on an fd + */ + + fd = local->fd; + if (!fd || __is_first_write_on_fd (this, fd)) + op_ret = 1; + + break; + + case GF_FOP_FLUSH: + /* only do post-op on flush() */ + + op_ret = 0; + break; + + default: + op_ret = 1; + } + } + + return op_ret; +} + + +static int +__changelog_needed_post_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int ret = 0; + afr_transaction_type type = -1; + + priv = this->private; + local = frame->local; + type = local->transaction.type; + + if (__changelog_enabled (priv, type) + && (local->op != GF_FOP_WRITE) + && (local->op != GF_FOP_FTRUNCATE)) + ret = 1; + + return ret; +} + + +static int +afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) +{ + int ret = 0; + + switch (type) { + case AFR_FLUSH_TRANSACTION: + case AFR_DATA_TRANSACTION: + ret = priv->data_lock_server_count; + break; + + case AFR_METADATA_TRANSACTION: + ret = priv->metadata_lock_server_count; + break; + + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + ret = priv->entry_lock_server_count; + break; + } + + return ret; +} + + +/* {{{ unlock */ + +int32_t +afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local; + int call_count = 0; + + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + local->transaction.done (frame, this); + } + + return 0; +} + + +int +afr_unlock (call_frame_t *frame, xlator_t *this) +{ + struct flock flock; + + int i = 0; + int call_count = 0; + + afr_local_t *local = NULL; + afr_private_t * priv = this->private; + + local = frame->local; + + call_count = afr_locked_nodes_count (local->transaction.locked_nodes, + priv->child_count); + + if (call_count == 0) { + local->transaction.done (frame, this); + return 0; + } + + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) + call_count *= 2; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + flock.l_start = local->transaction.start; + flock.l_len = local->transaction.len; + flock.l_type = F_UNLCK; + + if (local->transaction.locked_nodes[i]) { + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + case AFR_FLUSH_TRANSACTION: + + if (local->fd) { + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->finodelk, + local->fd, F_SETLK, &flock); + } else { + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + } + + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + &local->transaction.new_parent_loc, + local->transaction.new_basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + + call_count--; + + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + if (local->fd) { + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->fentrylk, + local->fd, + local->transaction.basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + } else { + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + &local->transaction.parent_loc, + local->transaction.basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + + } + break; + } + + if (!--call_count) + break; + } + } + + return 0; +} + +/* }}} */ + + +/* {{{ pending */ + +int32_t +afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int call_count = -1; + + priv = this->private; + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + local->transaction.done (frame, this); + } else { + afr_unlock (frame, this); + } + } + + return 0; +} + + +int +afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = this->private; + + int ret = 0; + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + dict_t * xattr = dict_ref (get_new_dict ()); + + local = frame->local; + + __mark_all_success (local->pending_array, priv->child_count); + __mark_down_children (local->pending_array, priv->child_count, local->child_up); + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + call_count *= 2; + } + + local->call_count = call_count; + + if (call_count == 0) { + /* no child is up */ + dict_unref (xattr); + afr_unlock (frame, this); + return 0; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + ret = dict_set_static_bin (xattr, local->transaction.pending, + local->pending_array, + priv->child_count * sizeof (int32_t)); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "failed to set pending entry"); + + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + case AFR_FLUSH_TRANSACTION: + { + if (local->fd) + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr); + else + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, xattr); + } + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + { + STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr); + + call_count--; + } + + /* + set it again because previous stack_wind + might have already returned (think of case + where subvolume is posix) and would have + used the dict as placeholder for return + value + */ + ret = dict_set_static_bin (xattr, local->transaction.pending, + local->pending_array, + priv->child_count * sizeof (int32_t)); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "failed to set pending entry"); + + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + { + if (local->fd) + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr); + else + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr); + } + break; + } + + if (!--call_count) + break; + } + } + + dict_unref (xattr); + return 0; +} + + +int32_t +afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ + afr_local_t * local = NULL; + afr_private_t * priv = this->private; + loc_t * loc = NULL; + + int call_count = -1; + int child_index = (long) cookie; + + local = frame->local; + loc = &local->loc; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->child_up[child_index] = 0; + + if (op_errno == ENOTSUP) { + gf_log (this->name, GF_LOG_ERROR, + "xattrop not supported by %s", + priv->children[child_index]->name); + local->op_ret = -1; + } else if (!child_went_down (op_ret, op_errno)) { + gf_log (this->name, GF_LOG_ERROR, + "xattrop failed on child %s: %s", + priv->children[child_index]->name, + strerror (op_errno)); + } + local->op_errno = op_errno; + } + + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + if ((local->op_ret == -1) && + (local->op_errno == ENOTSUP)) { + local->transaction.resume (frame, this); + } else { + local->transaction.fop (frame, this); + } + } + + return 0; +} + + +int +afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = this->private; + + int i = 0; + int ret = 0; + int call_count = 0; + dict_t *xattr = NULL; + + afr_local_t *local = NULL; + + local = frame->local; + xattr = get_new_dict (); + dict_ref (xattr); + + call_count = afr_up_children_count (priv->child_count, + local->child_up); + + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + call_count *= 2; + } + + if (call_count == 0) { + /* no child is up */ + dict_unref (xattr); + afr_unlock (frame, this); + return 0; + } + + local->call_count = call_count; + + __mark_all_pending (local->pending_array, priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + ret = dict_set_static_bin (xattr, + local->transaction.pending, + local->pending_array, + (priv->child_count * + sizeof (int32_t))); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "failed to set pending entry"); + + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + case AFR_FLUSH_TRANSACTION: + { + if (local->fd) + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr); + else + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &(local->loc), + GF_XATTROP_ADD_ARRAY, xattr); + } + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + { + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr); + + call_count--; + } + + + /* + set it again because previous stack_wind + might have already returned (think of case + where subvolume is posix) and would have + used the dict as placeholder for return + value + */ + + ret = dict_set_static_bin (xattr, local->transaction.pending, + local->pending_array, + priv->child_count * sizeof (int32_t)); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "failed to set pending entry"); + + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + { + if (local->fd) + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr); + else + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr); + } + + break; + } + + if (!--call_count) + break; + } + } + + dict_unref (xattr); + return 0; +} + +/* }}} */ + +/* {{{ lock */ + +static +int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index); + +int32_t +afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int done = 0; + int child_index = (long) cookie; + + int call_count = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + /* wait for the other lock to return */ + call_count = --local->call_count; + } + + if (op_ret == -1) { + if (op_errno == ENOSYS) { + /* return ENOTSUP */ + gf_log (this->name, GF_LOG_ERROR, + "subvolume does not support locking. " + "please load features/posix-locks xlator on server"); + local->op_ret = op_ret; + done = 1; + } + + local->child_up[child_index] = 0; + local->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + if ((local->op_ret == -1) && + (local->op_errno == ENOSYS)) { + afr_unlock (frame, this); + } else { + local->transaction.locked_nodes[child_index] = 1; + local->transaction.lock_count++; + afr_lock_rec (frame, this, child_index + 1); + } + } + + return 0; +} + + +static loc_t * +lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) +{ + int ret = 0; + + ret = strcmp (l1->path, l2->path); + + if (ret == 0) + ret = strcmp (b1, b2); + + if (ret <= 0) + return l1; + else + return l2; +} + + +static +int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + struct flock flock; + + loc_t * lower = NULL; + loc_t * higher = NULL; + + const char *lower_name = NULL; + const char *higher_name = NULL; + + local = frame->local; + priv = this->private; + + flock.l_start = local->transaction.start; + flock.l_len = local->transaction.len; + flock.l_type = F_WRLCK; + + /* skip over children that are down */ + while ((child_index < priv->child_count) + && !local->child_up[child_index]) + child_index++; + + if ((child_index == priv->child_count) && + local->transaction.lock_count == 0) { + + gf_log (this->name, GF_LOG_DEBUG, + "unable to lock on even one child"); + + local->op_ret = -1; + local->op_errno = EAGAIN; + + local->transaction.done (frame, this); + + return 0; + + } + + if ((child_index == priv->child_count) + || (local->transaction.lock_count == + afr_lock_server_count (priv, local->transaction.type))) { + + /* we're done locking */ + + if (__changelog_needed_pre_op (frame, this)) { + afr_changelog_pre_op (frame, this); + } else { + local->transaction.fop (frame, this); + } + + return 0; + } + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + case AFR_FLUSH_TRANSACTION: + + if (local->fd) { + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->finodelk, + local->fd, F_SETLKW, &flock); + + } else { + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->inodelk, + &local->loc, F_SETLKW, &flock); + } + + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + { + local->call_count = 2; + + lower = lower_path (&local->transaction.parent_loc, + local->transaction.basename, + &local->transaction.new_parent_loc, + local->transaction.new_basename); + + lower_name = (lower == &local->transaction.parent_loc ? + local->transaction.basename : + local->transaction.new_basename); + + higher = (lower == &local->transaction.parent_loc ? + &local->transaction.new_parent_loc : + &local->transaction.parent_loc); + + higher_name = (higher == &local->transaction.parent_loc ? + local->transaction.basename : + local->transaction.new_basename); + + + /* TODO: these locks should be blocking */ + + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->entrylk, + lower, lower_name, + ENTRYLK_LOCK, ENTRYLK_WRLCK); + + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->entrylk, + higher, higher_name, + ENTRYLK_LOCK, ENTRYLK_WRLCK); + + break; + } + + case AFR_ENTRY_TRANSACTION: + if (local->fd) { + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->fentrylk, + local->fd, + local->transaction.basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK); + } else { + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->entrylk, + &local->transaction.parent_loc, + local->transaction.basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK); + } + + break; + } + + return 0; +} + + +int32_t afr_lock (call_frame_t *frame, xlator_t *this) +{ + return afr_lock_rec (frame, this, 0); +} + + +/* }}} */ + +int32_t +afr_transaction_resume (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + local = frame->local; + priv = this->private; + + if (__changelog_needed_post_op (frame, this)) { + afr_changelog_post_op (frame, this); + } else { + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + local->transaction.done (frame, this); + } else { + afr_unlock (frame, this); + } + } + + return 0; +} + + +/** + * afr_transaction_child_died - inform that a child died during an fop + */ + +void +afr_transaction_child_died (call_frame_t *frame, xlator_t *this, int child_index) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + local = frame->local; + priv = this->private; + + __mark_child_dead (local->pending_array, priv->child_count, child_index); +} + + +int32_t +afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + local = frame->local; + priv = this->private; + + afr_transaction_local_init (local, priv); + + local->transaction.resume = afr_transaction_resume; + local->transaction.type = type; + + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + if (__changelog_needed_pre_op (frame, this)) { + afr_changelog_pre_op (frame, this); + } else { + local->transaction.fop (frame, this); + } + } else { + afr_lock (frame, this); + } + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h new file mode 100644 index 00000000000..49cdd219f25 --- /dev/null +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -0,0 +1,36 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __TRANSACTION_H__ +#define __TRANSACTION_H__ + +#define AFR_METADATA_PENDING "trusted.glusterfs.afr.metadata-pending" + +#define AFR_DATA_PENDING "trusted.glusterfs.afr.data-pending" + +#define AFR_ENTRY_PENDING "trusted.glusterfs.afr.entry-pending" + +void +afr_transaction_child_died (call_frame_t *frame, xlator_t *this, + int child_index); + +int32_t +afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); + +#endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c new file mode 100644 index 00000000000..e4c1a847985 --- /dev/null +++ b/xlators/cluster/afr/src/afr.c @@ -0,0 +1,2338 @@ +/* + Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-inode-read.h" +#include "afr-inode-write.h" +#include "afr-dir-read.h" +#include "afr-dir-write.h" +#include "afr-transaction.h" + +#include "afr-self-heal.h" + + +/** + * afr_local_cleanup - cleanup everything in frame->local + */ + +void +afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) +{ + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + + + sh = &local->self_heal; + priv = this->private; + + if (sh->buf) + FREE (sh->buf); + + if (sh->xattr) { + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + dict_unref (sh->xattr[i]); + sh->xattr[i] = NULL; + } + } + FREE (sh->xattr); + } + + if (sh->child_errno) + FREE (sh->child_errno); + + if (sh->pending_matrix) { + for (i = 0; i < priv->child_count; i++) { + FREE (sh->pending_matrix[i]); + } + FREE (sh->pending_matrix); + } + + if (sh->delta_matrix) { + for (i = 0; i < priv->child_count; i++) { + FREE (sh->delta_matrix[i]); + } + FREE (sh->delta_matrix); + } + + if (sh->sources) + FREE (sh->sources); + + if (sh->success) + FREE (sh->success); + + if (sh->healing_fd) { + fd_unref (sh->healing_fd); + sh->healing_fd = NULL; + } + + loc_wipe (&sh->parent_loc); +} + + +void +afr_local_cleanup (afr_local_t *local, xlator_t *this) +{ + if (!local) + return; + + afr_local_sh_cleanup (local, this); + + FREE (local->child_errno); + FREE (local->pending_array); + + loc_wipe (&local->loc); + loc_wipe (&local->newloc); + + FREE (local->transaction.locked_nodes); + FREE (local->transaction.child_errno); + + FREE (local->transaction.basename); + FREE (local->transaction.new_basename); + + loc_wipe (&local->transaction.parent_loc); + loc_wipe (&local->transaction.new_parent_loc); + + if (local->fd) + fd_unref (local->fd); + + if (local->xattr_req) + dict_unref (local->xattr_req); + + FREE (local->child_up); + + { /* lookup */ + if (local->cont.lookup.xattr) + dict_unref (local->cont.lookup.xattr); + } + + { /* getxattr */ + if (local->cont.getxattr.name) + FREE (local->cont.getxattr.name); + } + + { /* lk */ + if (local->cont.lk.locked_nodes) + FREE (local->cont.lk.locked_nodes); + } + + { /* checksum */ + if (local->cont.checksum.file_checksum) + FREE (local->cont.checksum.file_checksum); + if (local->cont.checksum.dir_checksum) + FREE (local->cont.checksum.dir_checksum); + } + + { /* create */ + if (local->cont.create.fd) + fd_unref (local->cont.create.fd); + } + + { /* writev */ + FREE (local->cont.writev.vector); + } + + { /* setxattr */ + if (local->cont.setxattr.dict) + dict_unref (local->cont.setxattr.dict); + } + + { /* removexattr */ + FREE (local->cont.removexattr.name); + } + + { /* symlink */ + FREE (local->cont.symlink.linkpath); + } +} + + +int +afr_frame_return (call_frame_t *frame) +{ + afr_local_t *local = NULL; + int call_count = 0; + + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + return call_count; +} + +/** + * first_up_child - return the index of the first child that is up + */ + +int +afr_first_up_child (afr_private_t *priv) +{ + xlator_t ** children = NULL; + int ret = -1; + int i = 0; + + LOCK (&priv->lock); + { + children = priv->children; + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i]) { + ret = i; + break; + } + } + } + UNLOCK (&priv->lock); + + return ret; +} + + +/** + * up_children_count - return the number of children that are up + */ + +int +afr_up_children_count (int child_count, unsigned char *child_up) +{ + int i = 0; + int ret = 0; + + for (i = 0; i < child_count; i++) + if (child_up[i]) + ret++; + return ret; +} + + +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) +{ + int ret = 0; + int i; + + for (i = 0; i < child_count; i++) + if (locked_nodes[i]) + ret++; + + return ret; +} + + +ino64_t +afr_itransform (ino64_t ino, int child_count, int child_index) +{ + ino64_t scaled_ino = -1; + + if (ino == ((uint64_t) -1)) { + scaled_ino = ((uint64_t) -1); + goto out; + } + + scaled_ino = (ino * child_count) + child_index; + +out: + return scaled_ino; +} + + +int +afr_deitransform_orig (ino64_t ino, int child_count) +{ + int index = -1; + + index = ino % child_count; + + return index; +} + + +int +afr_deitransform (ino64_t ino, int child_count) +{ + return 0; +} + + +int +afr_self_heal_cbk (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + int ret = -1; + + local = frame->local; + + if (local->govinda_gOvinda) { + ret = inode_ctx_put (local->cont.lookup.inode, this, 1); + + if (ret < 0) { + local->op_ret = -1; + local->op_errno = -ret; + } + } else { + inode_ctx_del (local->cont.lookup.inode, this, NULL); + } + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->cont.lookup.inode, + &local->cont.lookup.buf, + local->cont.lookup.xattr); + + return 0; +} + + +int +afr_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + struct stat * lookup_buf = NULL; + int call_count = -1; + int child_index = -1; + int prev_child_index = -1; + uint32_t open_fd_count = 0; + int ret = 0; + + child_index = (long) cookie; + priv = this->private; + + LOCK (&frame->lock); + { + local = frame->local; + + lookup_buf = &local->cont.lookup.buf; + + if (op_ret == -1) { + if (op_errno == ENOENT) + local->enoent_count++; + + if (op_errno != ENOTCONN) + local->op_errno = op_errno; + + goto unlock; + } + + if (afr_sh_has_metadata_pending (xattr, child_index, this)) + local->need_metadata_self_heal = 1; + + if (afr_sh_has_entry_pending (xattr, child_index, this)) + local->need_entry_self_heal = 1; + + if (afr_sh_has_data_pending (xattr, child_index, this)) + local->need_data_self_heal = 1; + + ret = dict_get_uint32 (xattr, GLUSTERFS_OPEN_FD_COUNT, + &open_fd_count); + local->open_fd_count += open_fd_count; + + /* in case of revalidate, we need to send stat of the + * child whose stat was sent during the first lookup. + * (so that time stamp does not vary with revalidate. + * in case it is down, stat of the fist success will + * be replied */ + + /* inode number should be preserved across revalidates */ + + if (local->success_count == 0) { + local->op_ret = op_ret; + + local->cont.lookup.inode = inode; + local->cont.lookup.xattr = dict_ref (xattr); + + *lookup_buf = *buf; + lookup_buf->st_ino = afr_itransform (buf->st_ino, + priv->child_count, + child_index); + } else { + if (FILETYPE_DIFFERS (buf, lookup_buf)) { + /* mismatching filetypes with same name + -- Govinda !! GOvinda !!! + */ + local->govinda_gOvinda = 1; + } + + if (PERMISSION_DIFFERS (buf, lookup_buf)) { + /* mismatching permissions */ + local->need_metadata_self_heal = 1; + } + + if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { + /* mismatching permissions */ + local->need_metadata_self_heal = 1; + } + + if (SIZE_DIFFERS (buf, lookup_buf) + && S_ISREG (buf->st_mode)) { + local->need_data_self_heal = 1; + } + + prev_child_index = afr_deitransform_orig (lookup_buf->st_ino, + priv->child_count); + if (child_index < prev_child_index) { + *lookup_buf = *buf; + lookup_buf->st_ino = afr_itransform (buf->st_ino, + priv->child_count, + child_index); + } + } + + local->success_count++; + } +unlock: + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (local->op_ret == 0) { + /* KLUDGE: assuming DHT will not itransform in + revalidate */ + if (local->cont.lookup.inode->ino) + lookup_buf->st_ino = + local->cont.lookup.inode->ino; + } + + if (local->success_count && local->enoent_count) { + local->need_metadata_self_heal = 1; + local->need_data_self_heal = 1; + local->need_entry_self_heal = 1; + } + + if (local->success_count) { + /* check for govinda_gOvinda case in previous lookup */ + if (!inode_ctx_get (local->cont.lookup.inode, + this, NULL)) + local->need_data_self_heal = 1; + } + + if ((local->need_metadata_self_heal + || local->need_data_self_heal + || local->need_entry_self_heal) + && (!local->open_fd_count)) { + + if (!local->cont.lookup.inode->st_mode) { + /* fix for RT #602 */ + local->cont.lookup.inode->st_mode = + lookup_buf->st_mode; + } + + afr_self_heal (frame, this, afr_self_heal_cbk); + } else { + AFR_STACK_UNWIND (frame, local->op_ret, + local->op_errno, + local->cont.lookup.inode, + &local->cont.lookup.buf, + local->cont.lookup.xattr); + } + } + + return 0; +} + + +int +afr_lookup (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr_req) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int ret = -1; + int i = 0; + int32_t op_errno = 0; + + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + local->op_ret = -1; + + frame->local = local; + + loc_copy (&local->loc, loc); + + local->reval_child_index = 0; + + local->call_count = priv->child_count; + + local->child_up = memdup (priv->child_up, priv->child_count); + local->child_count = afr_up_children_count (priv->child_count, + local->child_up); + + /* By default assume ENOTCONN. On success it will be set to 0. */ + local->op_errno = ENOTCONN; + + if ((xattr_req == NULL) + && (priv->metadata_self_heal + || priv->data_self_heal + || priv->entry_self_heal)) + local->xattr_req = dict_new (); + else + local->xattr_req = dict_ref (xattr_req); + + if (priv->metadata_self_heal) { + ret = dict_set_uint64 (local->xattr_req, AFR_METADATA_PENDING, + priv->child_count * sizeof(int32_t)); + } + + if (priv->data_self_heal) { + ret = dict_set_uint64 (local->xattr_req, AFR_DATA_PENDING, + priv->child_count * sizeof(int32_t)); + } + + if (priv->entry_self_heal) { + ret = dict_set_uint64 (local->xattr_req, AFR_ENTRY_PENDING, + priv->child_count * sizeof(int32_t)); + } + + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 0); + + for (i = 0; i < priv->child_count; i++) { + STACK_WIND_COOKIE (frame, afr_lookup_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + loc, local->xattr_req); + } + + ret = 0; +out: + if (ret == -1) + AFR_STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL, NULL); + + return 0; +} + + +/* {{{ open */ + +int +afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = frame->local; + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->fd); + return 0; +} + + +int +afr_open_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + fd_t *fd) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + + priv = this->private; + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + } + + if (op_ret >= 0) { + local->op_ret = op_ret; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if ((local->cont.open.flags & O_TRUNC) + && (local->op_ret >= 0)) { + STACK_WIND (frame, afr_open_ftruncate_cbk, + this, this->fops->ftruncate, + fd, 0); + } else { + AFR_STACK_UNWIND (frame, local->op_ret, + local->op_errno, local->fd); + } + } + + return 0; +} + + +int +afr_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int i = 0; + int ret = -1; + + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + int32_t wind_flags = flags & (~O_TRUNC); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + + ret = inode_ctx_get (loc->inode, this, NULL); + if (ret == 0) { + /* if ctx is set it means self-heal failed */ + + gf_log (this->name, GF_LOG_WARNING, + "returning EIO, file has to be manually corrected " + "in backend"); + op_errno = EIO; + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + call_count = local->call_count; + + local->cont.open.flags = flags; + local->fd = fd_ref (fd); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->open, + loc, wind_flags, fd); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, fd); + } + + return 0; +} + +/* }}} */ + +/* {{{ flush */ + +int +afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_flush_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int i = 0; + int call_count = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_flush_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->flush, + local->fd); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_flush_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +afr_simple_flush_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +static int +__is_fd_ctx_set (xlator_t *this, fd_t *fd) +{ + int _ret = 0; + int op_ret = 0; + + _ret = fd_ctx_get (fd, this, NULL); + if (_ret == 0) + op_ret = 1; + + return op_ret; +} + + +int +afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int ret = -1; + int i = 0; + int call_count = 0; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + + if (__is_fd_ctx_set (this, fd)) { + local->op = GF_FOP_FLUSH; + local->transaction.fop = afr_flush_wind; + local->transaction.done = afr_flush_done; + + local->fd = fd_ref (fd); + + local->transaction.start = 0; + local->transaction.len = 0; + + local->transaction.pending = AFR_DATA_PENDING; + + afr_transaction (frame, this, AFR_FLUSH_TRANSACTION); + } else { + /* + * if fd's ctx is not set, then there is no need + * to erase changelog. So just send the flush + */ + + call_count = local->call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_simple_flush_cbk, + priv->children[i], + priv->children[i]->fops->flush, + fd); + + if (!--call_count) + break; + } + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ fsync */ + +int +afr_fsync_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t datasync) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fsync_cbk, + priv->children[i], + priv->children[i]->fops->fsync, + fd, datasync); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + +/* }}} */ + +/* {{{ fsync */ + +int32_t +afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t datasync) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fsync_cbk, + priv->children[i], + priv->children[i]->fops->fsyncdir, + fd, datasync); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + +/* }}} */ + +/* {{{ xattrop */ + +int32_t +afr_xattrop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr); + + return 0; +} + + +int32_t +afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_xattrop_cbk, + priv->children[i], + priv->children[i]->fops->xattrop, + loc, optype, xattr); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + +/* }}} */ + +/* {{{ fxattrop */ + +int32_t +afr_fxattrop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr); + + return 0; +} + + +int32_t +afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fxattrop_cbk, + priv->children[i], + priv->children[i]->fops->fxattrop, + fd, optype, xattr); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + +/* }}} */ + + +int32_t +afr_inodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_inodelk (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t cmd, struct flock *flock) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_inodelk_cbk, + priv->children[i], + priv->children[i]->fops->inodelk, + loc, cmd, flock); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +int32_t +afr_finodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_finodelk (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t cmd, struct flock *flock) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_finodelk_cbk, + priv->children[i], + priv->children[i]->fops->finodelk, + fd, cmd, flock); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +int32_t +afr_entrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_entrylk_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + loc, basename, cmd, type); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + + +int32_t +afr_fentrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_fentrylk (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fentrylk_cbk, + priv->children[i], + priv->children[i]->fops->fentrylk, + fd, basename, cmd, type); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +int32_t +afr_checksum_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + uint8_t *file_checksum, uint8_t *dir_checksum) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0 && (local->op_ret != 0)) { + local->op_ret = 0; + + local->cont.checksum.file_checksum = MALLOC (ZR_FILENAME_MAX); + memcpy (local->cont.checksum.file_checksum, file_checksum, + ZR_FILENAME_MAX); + + local->cont.checksum.dir_checksum = MALLOC (ZR_FILENAME_MAX); + memcpy (local->cont.checksum.dir_checksum, dir_checksum, + ZR_FILENAME_MAX); + + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->cont.checksum.file_checksum, + local->cont.checksum.dir_checksum); + + return 0; +} + + +int32_t +afr_checksum (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t flag) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_checksum_cbk, + priv->children[i], + priv->children[i]->fops->checksum, + loc, flag); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +int32_t +afr_statfs_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct statvfs *statvfs) +{ + afr_local_t *local = NULL; + + int call_count = 0; + + LOCK (&frame->lock); + { + local = frame->local; + + if (op_ret == 0) { + local->op_ret = op_ret; + + if (local->cont.statfs.buf_set) { + if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) + local->cont.statfs.buf = *statvfs; + } else { + local->cont.statfs.buf = *statvfs; + local->cont.statfs.buf_set = 1; + } + } + + if (op_ret == -1) + local->op_errno = op_errno; + + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->cont.statfs.buf); + + return 0; +} + + +int32_t +afr_statfs (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + afr_private_t * priv = NULL; + int child_count = 0; + afr_local_t * local = NULL; + int i = 0; + + int ret = -1; + int call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + child_count = priv->child_count; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + call_count = local->call_count; + + for (i = 0; i < child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_statfs_cbk, + priv->children[i], + priv->children[i]->fops->statfs, + loc); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +int32_t +afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct flock *lock) +{ + afr_local_t * local = NULL; + + int call_count = -1; + + local = frame->local; + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + lock); + + return 0; +} + + +int32_t +afr_lk_unlock (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int i; + int call_count = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes, + priv->child_count); + + if (call_count == 0) { + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->cont.lk.flock); + return 0; + } + + local->call_count = call_count; + + local->cont.lk.flock.l_type = F_UNLCK; + + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lk.locked_nodes[i]) { + STACK_WIND (frame, afr_lk_unlock_cbk, + priv->children[i], + priv->children[i]->fops->lk, + local->fd, F_SETLK, + &local->cont.lk.flock); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int32_t +afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct flock *lock) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + call_count = --local->call_count; + + if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) { + local->op_ret = -1; + local->op_errno = op_errno; + + afr_lk_unlock (frame, this); + return 0; + } + + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lk.flock = *lock; + local->cont.lk.locked_nodes[child_index] = 1; + } + + child_index++; + + if (child_index < priv->child_count) { + STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->lk, + local->fd, local->cont.lk.cmd, + &local->cont.lk.flock); + } else if (local->op_ret == -1) { + /* all nodes have gone down */ + + AFR_STACK_UNWIND (frame, -1, ENOTCONN, &local->cont.lk.flock); + } else { + /* locking has succeeded on all nodes that are up */ + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->cont.lk.flock); + } + + return 0; +} + + +int +afr_lk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, + struct flock *flock) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int i = 0; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_INIT (local, priv); + + frame->local = local; + + local->cont.lk.locked_nodes = CALLOC (priv->child_count, + sizeof (*local->cont.lk.locked_nodes)); + + if (!local->cont.lk.locked_nodes) { + gf_log (this->name, GF_LOG_ERROR, "out of memory :("); + op_errno = ENOMEM; + goto out; + } + + local->fd = fd_ref (fd); + local->cont.lk.cmd = cmd; + local->cont.lk.flock = *flock; + + STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, + priv->children[i], + priv->children[i]->fops->lk, + fd, cmd, flock); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +/** + * find_child_index - find the child's index in the array of subvolumes + * @this: AFR + * @child: child + */ + +static int +find_child_index (xlator_t *this, xlator_t *child) +{ + afr_private_t *priv = NULL; + + int i = -1; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if ((xlator_t *) child == priv->children[i]) + break; + } + + return i; +} + + +int32_t +notify (xlator_t *this, int32_t event, + void *data, ...) +{ + afr_private_t * priv = NULL; + unsigned char * child_up = NULL; + + int i = -1; + int up_children = 0; + + priv = this->private; + + if (!priv) + return 0; + + child_up = priv->child_up; + + switch (event) { + case GF_EVENT_CHILD_UP: + i = find_child_index (this, data); + + child_up[i] = 1; + + /* + if all the children were down, and one child came up, + send notify to parent + */ + + for (i = 0; i < priv->child_count; i++) + if (child_up[i]) + up_children++; + + if (up_children == 1) + default_notify (this, event, data); + + break; + + case GF_EVENT_CHILD_DOWN: + i = find_child_index (this, data); + + child_up[i] = 0; + + /* + if all children are down, and this was the last to go down, + send notify to parent + */ + + for (i = 0; i < priv->child_count; i++) + if (child_up[i]) + up_children++; + + if (up_children == 0) + default_notify (this, event, data); + + break; + + default: + default_notify (this, event, data); + } + + return 0; +} + + +static const char *favorite_child_warning_str = "You have specified subvolume '%s' " + "as the 'favorite child'. This means that if a discrepancy in the content " + "or attributes (ownership, permission, etc.) of a file is detected among " + "the subvolumes, the file on '%s' will be considered the definitive " + "version and its contents will OVERWRITE the contents of the file on other " + "subvolumes. All versions of the file except that on '%s' " + "WILL BE LOST."; + +static const char *no_lock_servers_warning_str = "You have set lock-server-count = 0. " + "This means correctness is NO LONGER GUARANTEED in all cases. If two or more " + "applications write to the same region of a file, there is a possibility that " + "its copies will be INCONSISTENT. Set it to a value greater than 0 unless you " + "are ABSOLUTELY SURE of what you are doing and WILL NOT HOLD GlusterFS " + "RESPOSIBLE for inconsistent data. If you are in doubt, set it to a value " + "greater than 0."; + +int32_t +init (xlator_t *this) +{ + afr_private_t * priv = NULL; + int child_count = 0; + xlator_list_t * trav = NULL; + int i = 0; + int ret = -1; + int op_errno = 0; + + char * read_subvol = NULL; + char * fav_child = NULL; + char * self_heal = NULL; + char * change_log = NULL; + + int32_t lock_server_count = 1; + + int fav_ret = -1; + int read_ret = -1; + int dict_ret = -1; + + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, + "AFR needs more than one child defined"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + ALLOC_OR_GOTO (this->private, afr_private_t, out); + + priv = this->private; + + read_ret = dict_get_str (this->options, "read-subvolume", &read_subvol); + priv->read_child = -1; + + fav_ret = dict_get_str (this->options, "favorite-child", &fav_child); + priv->favorite_child = -1; + + /* Default values */ + + priv->data_self_heal = 1; + priv->metadata_self_heal = 1; + priv->entry_self_heal = 1; + + dict_ret = dict_get_str (this->options, "data-self-heal", &self_heal); + if (dict_ret == 0) { + ret = gf_string2boolean (self_heal, &priv->data_self_heal); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option data-self-heal %s' " + "defaulting to data-self-heal as 'on'", + self_heal); + priv->data_self_heal = 1; + } + } + + dict_ret = dict_get_str (this->options, "metadata-self-heal", + &self_heal); + if (dict_ret == 0) { + ret = gf_string2boolean (self_heal, &priv->metadata_self_heal); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option metadata-self-heal %s' " + "defaulting to metadata-self-heal as 'on'", + self_heal); + priv->metadata_self_heal = 1; + } + } + + dict_ret = dict_get_str (this->options, "entry-self-heal", &self_heal); + if (dict_ret == 0) { + ret = gf_string2boolean (self_heal, &priv->entry_self_heal); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option entry-self-heal %s' " + "defaulting to entry-self-heal as 'on'", + self_heal); + priv->entry_self_heal = 1; + } + } + + /* Change log options */ + + priv->data_change_log = 1; + priv->metadata_change_log = 0; + priv->entry_change_log = 1; + + dict_ret = dict_get_str (this->options, "data-change-log", + &change_log); + if (dict_ret == 0) { + ret = gf_string2boolean (change_log, &priv->data_change_log); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option data-change-log %s'. " + "defaulting to data-change-log as 'on'", + change_log); + priv->data_change_log = 1; + } + } + + dict_ret = dict_get_str (this->options, "metadata-change-log", + &change_log); + if (dict_ret == 0) { + ret = gf_string2boolean (change_log, + &priv->metadata_change_log); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option metadata-change-log %s'. " + "defaulting to metadata-change-log as 'off'", + change_log); + priv->metadata_change_log = 0; + } + } + + dict_ret = dict_get_str (this->options, "entry-change-log", + &change_log); + if (dict_ret == 0) { + ret = gf_string2boolean (change_log, &priv->entry_change_log); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option entry-change-log %s'. " + "defaulting to entry-change-log as 'on'", + change_log); + priv->entry_change_log = 1; + } + } + + /* Locking options */ + + priv->data_lock_server_count = 1; + priv->metadata_lock_server_count = 0; + priv->entry_lock_server_count = 1; + + dict_ret = dict_get_int32 (this->options, "data-lock-server-count", + &lock_server_count); + if (dict_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setting data lock server count to %d", + lock_server_count); + + if (lock_server_count == 0) + gf_log (this->name, GF_LOG_WARNING, + no_lock_servers_warning_str); + + priv->data_lock_server_count = lock_server_count; + } + + + dict_ret = dict_get_int32 (this->options, + "metadata-lock-server-count", + &lock_server_count); + if (dict_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setting metadata lock server count to %d", + lock_server_count); + priv->metadata_lock_server_count = lock_server_count; + } + + + dict_ret = dict_get_int32 (this->options, "entry-lock-server-count", + &lock_server_count); + if (dict_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setting entry lock server count to %d", + lock_server_count); + + priv->entry_lock_server_count = lock_server_count; + } + + + trav = this->children; + while (trav) { + if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) { + gf_log (this->name, GF_LOG_DEBUG, + "subvolume '%s' specified as read child", + trav->xlator->name); + + priv->read_child = child_count; + } + + if (fav_ret == 0 && !strcmp (fav_child, trav->xlator->name)) { + gf_log (this->name, GF_LOG_WARNING, + favorite_child_warning_str, trav->xlator->name, + trav->xlator->name, trav->xlator->name); + priv->favorite_child = child_count; + } + + child_count++; + trav = trav->next; + } + + /* XXX: return inode numbers from 1st subvolume till + afr supports read-subvolume based on inode's ctx + (and not itransform) for this reason afr_deitransform() + returns 0 always + */ + priv->read_child = 0; + + priv->wait_count = 1; + + priv->child_count = child_count; + LOCK_INIT (&priv->lock); + + priv->child_up = CALLOC (sizeof (unsigned char), child_count); + if (!priv->child_up) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + op_errno = ENOMEM; + goto out; + } + + priv->children = CALLOC (sizeof (xlator_t *), child_count); + if (!priv->children) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + op_errno = ENOMEM; + goto out; + } + + trav = this->children; + i = 0; + while (i < child_count) { + priv->children[i] = trav->xlator; + + trav = trav->next; + i++; + } + + ret = 0; +out: + return ret; +} + + +int +fini (xlator_t *this) +{ + return 0; +} + + +struct xlator_fops fops = { + .lookup = afr_lookup, + .open = afr_open, + .lk = afr_lk, + .flush = afr_flush, + .statfs = afr_statfs, + .fsync = afr_fsync, + .fsyncdir = afr_fsyncdir, + .xattrop = afr_xattrop, + .fxattrop = afr_fxattrop, + .inodelk = afr_inodelk, + .finodelk = afr_finodelk, + .entrylk = afr_entrylk, + .fentrylk = afr_fentrylk, + .checksum = afr_checksum, + + /* inode read */ + .access = afr_access, + .stat = afr_stat, + .fstat = afr_fstat, + .readlink = afr_readlink, + .getxattr = afr_getxattr, + .readv = afr_readv, + + /* inode write */ + .chmod = afr_chmod, + .chown = afr_chown, + .fchmod = afr_fchmod, + .fchown = afr_fchown, + .writev = afr_writev, + .truncate = afr_truncate, + .ftruncate = afr_ftruncate, + .utimens = afr_utimens, + .setxattr = afr_setxattr, + .removexattr = afr_removexattr, + + /* dir read */ + .opendir = afr_opendir, + .readdir = afr_readdir, + .getdents = afr_getdents, + + /* dir write */ + .create = afr_create, + .mknod = afr_mknod, + .mkdir = afr_mkdir, + .unlink = afr_unlink, + .rmdir = afr_rmdir, + .link = afr_link, + .symlink = afr_symlink, + .rename = afr_rename, + .setdents = afr_setdents, +}; + + +struct xlator_mops mops = { +}; + + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = {"read-subvolume" }, + .type = GF_OPTION_TYPE_XLATOR + }, + { .key = {"favorite-child"}, + .type = GF_OPTION_TYPE_XLATOR + }, + { .key = {"data-self-heal"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"metadata-self-heal"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"entry-self-heal"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"data-change-log"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"metadata-change-log"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"entry-change-log"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"data-lock-server-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 0 + }, + { .key = {"metadata-lock-server-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 0 + }, + { .key = {"entry-lock-server-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 0 + }, + { .key = {NULL} }, +}; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h new file mode 100644 index 00000000000..4cf6cdf9dfe --- /dev/null +++ b/xlators/cluster/afr/src/afr.h @@ -0,0 +1,523 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#ifndef __AFR_H__ +#define __AFR_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "scheduler.h" +#include "call-stub.h" +#include "compat-errno.h" + + +typedef struct _afr_private { + gf_lock_t lock; /* to guard access to child_count, etc */ + unsigned int child_count; /* total number of children */ + + xlator_t **children; + + unsigned char *child_up; + + gf_boolean_t data_self_heal; /* on/off */ + gf_boolean_t metadata_self_heal; /* on/off */ + gf_boolean_t entry_self_heal; /* on/off */ + + + gf_boolean_t data_change_log; /* on/off */ + gf_boolean_t metadata_change_log; /* on/off */ + gf_boolean_t entry_change_log; /* on/off */ + + unsigned int read_child; /* read-subvolume */ + unsigned int favorite_child; /* subvolume to be preferred in resolving + split-brain cases */ + + unsigned int data_lock_server_count; + unsigned int metadata_lock_server_count; + unsigned int entry_lock_server_count; + + unsigned int wait_count; /* # of servers to wait for success */ +} afr_private_t; + +typedef struct { + /* array of stat's, one for each child */ + struct stat *buf; + + /* array of xattr's, one for each child */ + dict_t **xattr; + + /* array of errno's, one for each child */ + int *child_errno; + + int32_t **pending_matrix; + int32_t **delta_matrix; + + int *sources; + int source; + int active_source; + int active_sinks; + int *success; + + fd_t *healing_fd; + int op_failed; + + int file_has_holes; + blksize_t block_size; + off_t file_size; + off_t offset; + + loc_t parent_loc; + int (*completion_cbk) (call_frame_t *frame, xlator_t *this); + call_frame_t *sh_frame; +} afr_self_heal_t; + + +typedef enum { + AFR_DATA_TRANSACTION, /* truncate, write, ... */ + AFR_METADATA_TRANSACTION, /* chmod, chown, ... */ + AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */ + AFR_ENTRY_RENAME_TRANSACTION, /* rename */ + AFR_FLUSH_TRANSACTION, /* flush */ +} afr_transaction_type; + +typedef struct _afr_local { + unsigned int call_count; + unsigned int success_count; + unsigned int enoent_count; + + unsigned int need_metadata_self_heal; + unsigned int need_entry_self_heal; + unsigned int need_data_self_heal; + unsigned int govinda_gOvinda; + + unsigned int reval_child_index; + int32_t op_ret; + int32_t op_errno; + + int32_t *pending_array; + + loc_t loc; + loc_t newloc; + + fd_t *fd; + + glusterfs_fop_t fop; + + unsigned char *child_up; + int child_count; + + int32_t *child_errno; + + dict_t *xattr_req; + int open_fd_count; + /* + This struct contains the arguments for the "continuation" + (scheme-like) of fops + */ + + int op; + struct { + struct { + unsigned char buf_set; + struct statvfs buf; + } statfs; + + struct { + inode_t *inode; + struct stat buf; + dict_t *xattr; + } lookup; + + struct { + int32_t flags; + } open; + + struct { + int32_t cmd; + struct flock flock; + unsigned char *locked_nodes; + } lk; + + struct { + uint8_t *file_checksum; + uint8_t *dir_checksum; + } checksum; + + /* inode read */ + + struct { + int32_t mask; + int last_tried; /* index of the child we tried previously */ + } access; + + struct { + int last_tried; + ino_t ino; + } stat; + + struct { + int last_tried; + ino_t ino; + } fstat; + + struct { + size_t size; + int last_tried; + } readlink; + + struct { + const char *name; + int last_tried; + } getxattr; + + struct { + size_t size; + off_t offset; + int last_tried; + } readv; + + /* dir read */ + + struct { + int success_count; + int32_t op_ret; + int32_t op_errno; + } opendir; + + struct { + int32_t op_ret; + int32_t op_errno; + size_t size; + off_t offset; + + int last_tried; + } readdir; + + struct { + int32_t op_ret; + int32_t op_errno; + + size_t size; + off_t offset; + int32_t flag; + + int last_tried; + } getdents; + + /* inode write */ + + struct { + ino_t ino; + mode_t mode; + struct stat buf; + } chmod; + + struct { + ino_t ino; + mode_t mode; + struct stat buf; + } fchmod; + + struct { + ino_t ino; + uid_t uid; + gid_t gid; + struct stat buf; + } chown; + + struct { + ino_t ino; + uid_t uid; + gid_t gid; + struct stat buf; + } fchown; + + struct { + ino_t ino; + struct stat buf; + + int32_t op_ret; + + struct iovec *vector; + dict_t *refs; + int32_t count; + off_t offset; + } writev; + + struct { + ino_t ino; + off_t offset; + struct stat buf; + } truncate; + + struct { + ino_t ino; + off_t offset; + struct stat buf; + } ftruncate; + + struct { + ino_t ino; + struct timespec tv[2]; + struct stat buf; + } utimens; + + struct { + dict_t *dict; + int32_t flags; + } setxattr; + + struct { + const char *name; + } removexattr; + + /* dir write */ + + struct { + ino_t ino; + fd_t *fd; + int32_t flags; + mode_t mode; + inode_t *inode; + struct stat buf; + } create; + + struct { + ino_t ino; + dev_t dev; + mode_t mode; + inode_t *inode; + struct stat buf; + } mknod; + + struct { + ino_t ino; + int32_t mode; + inode_t *inode; + struct stat buf; + } mkdir; + + struct { + int32_t op_ret; + int32_t op_errno; + } unlink; + + struct { + int32_t op_ret; + int32_t op_errno; + } rmdir; + + struct { + ino_t ino; + struct stat buf; + } rename; + + struct { + ino_t ino; + inode_t *inode; + struct stat buf; + } link; + + struct { + ino_t ino; + inode_t *inode; + struct stat buf; + char *linkpath; + } symlink; + + struct { + int32_t flags; + dir_entry_t *entries; + int32_t count; + } setdents; + } cont; + + struct { + off_t start, len; + + unsigned char *locked_nodes; + int lock_count; + + const char *basename; + const char *new_basename; + + char *pending; + + loc_t parent_loc; + loc_t new_parent_loc; + + afr_transaction_type type; + + int success_count; + int erase_pending; + int failure_count; + + int last_tried; + int32_t *child_errno; + + call_frame_t *main_frame; + + int (*fop) (call_frame_t *frame, xlator_t *this); + + int (*done) (call_frame_t *frame, xlator_t *this); + + int (*resume) (call_frame_t *frame, xlator_t *this); + + int (*unwind) (call_frame_t *frame, xlator_t *this); + } transaction; + + afr_self_heal_t self_heal; +} afr_local_t; + +/* try alloc and if it fails, goto label */ +#define ALLOC_OR_GOTO(var, type, label) do { \ + var = CALLOC (sizeof (type), 1); \ + if (!var) { \ + gf_log (this->name, GF_LOG_ERROR, \ + "out of memory :("); \ + op_errno = ENOMEM; \ + goto label; \ + } \ + } while (0); + + +/* did a call fail due to a child failing? */ +#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ + ((op_errno == ENOTCONN) || \ + (op_errno == EBADFD))) + +/* have we tried all children? */ +#define all_tried(i, count) ((i) == (count) - 1) + +void +afr_build_parent_loc (loc_t *parent, loc_t *child); + +int +afr_up_children_count (int child_count, unsigned char *child_up); + +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); + +int +afr_first_up_child (afr_private_t *priv); + +ino64_t +afr_itransform (ino64_t ino, int child_count, int child_index); + +int +afr_deitransform (ino64_t ino, int child_count); + +void +afr_local_cleanup (afr_local_t *local, xlator_t *this); + +int +afr_frame_return (call_frame_t *frame); + +#define AFR_STACK_UNWIND(frame, params ...) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + STACK_UNWIND (frame, params); \ + afr_local_cleanup (__local, __this); \ + free (__local); \ +} while (0); + +#define AFR_STACK_DESTROY(frame) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ + afr_local_cleanup (__local, __this); \ + free (__local); \ +} while (0); + +/* allocate and return a string that is the basename of argument */ +static inline char * +AFR_BASENAME (const char *str) +{ + char *__tmp_str = NULL; + char *__basename_str = NULL; + __tmp_str = strdup (str); + __basename_str = strdup (basename (__tmp_str)); + FREE (__tmp_str); + return __basename_str; +} + +/* initialize local_t */ +static inline int +AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv) +{ + local->child_up = CALLOC (sizeof (*local->child_up), + priv->child_count); + if (!local->child_up) { + return -ENOMEM; + } + + memcpy (local->child_up, priv->child_up, + sizeof (*local->child_up) * priv->child_count); + + + local->call_count = afr_up_children_count (priv->child_count, local->child_up); + if (local->call_count == 0) + return -ENOTCONN; + + local->transaction.erase_pending = 1; + + local->op_ret = -1; + local->op_errno = EUCLEAN; + + return 0; +} + + +static inline int +afr_transaction_local_init (afr_local_t *local, afr_private_t *priv) +{ + local->child_errno = CALLOC (sizeof (*local->child_errno), + priv->child_count); + if (!local->child_errno) { + return -ENOMEM; + } + + local->pending_array = CALLOC (sizeof (*local->pending_array), + priv->child_count); + if (!local->pending_array) { + return -ENOMEM; + } + + local->transaction.locked_nodes = CALLOC (sizeof (*local->transaction.locked_nodes), + priv->child_count); + + local->transaction.child_errno = CALLOC (sizeof (*local->transaction.child_errno), + priv->child_count); + + return 0; +} + +#endif /* __AFR_H__ */ |