diff options
Diffstat (limited to 'xlators/cluster')
121 files changed, 100981 insertions, 46621 deletions
diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am index 0990822a7d3..8e067d5ab58 100644 --- a/xlators/cluster/Makefile.am +++ b/xlators/cluster/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = stripe afr dht +SUBDIRS = afr dht ec CLEANFILES = diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am index e192b599bf4..610819b28fc 100644 --- a/xlators/cluster/afr/src/Makefile.am +++ b/xlators/cluster/afr/src/Makefile.am @@ -1,27 +1,35 @@ -xlator_LTLIBRARIES = afr.la pump.la +xlator_LTLIBRARIES = afr.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster -afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c afr-self-heal-algorithm.c afr-lk-common.c $(top_builddir)/xlators/lib/src/libxlator.c +afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \ + afr-inode-write.c afr-open.c afr-transaction.c afr-lk-common.c \ + afr-read-txn.c \ + $(top_builddir)/xlators/lib/src/libxlator.c -afr_la_LDFLAGS = -module -avoidversion -afr_la_SOURCES = $(afr_common_source) afr.c +AFR_SELFHEAL_SOURCES = afr-self-heal-common.c afr-self-heal-data.c \ + afr-self-heal-entry.c afr-self-heal-metadata.c afr-self-heald.c \ + afr-self-heal-name.c + +afr_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) +afr_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) afr.c afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -pump_la_LDFLAGS = -module -avoidversion -pump_la_SOURCES = $(afr_common_source) pump.c -pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \ + afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-mem-types.h \ + afr-common.c afr-self-heald.h \ + $(top_builddir)/xlators/lib/src/libxlator.h afr-messages.h -noinst_HEADERS = afr.h afr-transaction.h 
afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c $(top_builddir)/xlators/lib/src/libxlator.h +AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/contrib/md5 -shared -nostartfiles $(GF_CFLAGS) \ - -I$(top_srcdir)/xlators/lib/src +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = uninstall-local: rm -f $(DESTDIR)$(xlatordir)/replicate.so - rm -f $(DESTDIR)$(xlatordir)/pump.so install-data-hook: ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index ffd2200066f..032ab5c8001 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -24,28 +15,20 @@ #include <stdlib.h> #include <signal.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" +#include <glusterfs/glusterfs.h> #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" -#include "statedump.h" - -#include "fd.h" +#include <glusterfs/dict.h> +#include <glusterfs/hashfn.h> +#include <glusterfs/list.h> +#include <glusterfs/call-stub.h> +#include <glusterfs/defaults.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/statedump.h> +#include <glusterfs/events.h> +#include <glusterfs/upcall-utils.h> #include "afr-inode-read.h" #include "afr-inode-write.h" @@ -53,2547 +36,7843 @@ #include "afr-dir-write.h" #include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "pump.h" - -#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000200000000ULL -#define AFR_ICTX_SPLIT_BRAIN_MASK 0x0000000100000000ULL -#define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL +#include "afr-self-heald.h" +#include "afr-messages.h" int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid) +afr_quorum_errno(afr_private_t *priv) { - int ret = 0; - - GF_ASSERT (gfid); + return ENOTCONN; +} - ret = dict_set_static_bin (dict, "gfid-req", gfid, 16); - if (ret) - gf_log (THIS->name, GF_LOG_DEBUG, "gfid set failed"); +gf_boolean_t +afr_is_private_directory(afr_private_t *priv, uuid_t 
pargfid, const char *name, + pid_t pid) +{ + if (!__is_root_gfid(pargfid)) { + return _gf_false; + } + + if (strcmp(name, GF_REPLICATE_TRASH_DIR) == 0) { + /*For backward compatibility /.landfill is private*/ + return _gf_true; + } + + if (pid == GF_CLIENT_PID_GSYNCD) { + /*geo-rep needs to create/sync private directory on slave because + * it appears in changelog*/ + return _gf_false; + } + + if (pid == GF_CLIENT_PID_GLFS_HEAL || pid == GF_CLIENT_PID_SELF_HEALD) { + if (strcmp(name, priv->anon_inode_name) == 0) { + /* anonymous-inode dir is private*/ + return _gf_true; + } + } else { + if (strncmp(name, AFR_ANON_DIR_PREFIX, strlen(AFR_ANON_DIR_PREFIX)) == + 0) { + /* anonymous-inode dir prefix is private for geo-rep to work*/ + return _gf_true; + } + } - return ret; + return _gf_false; } -uint64_t -afr_is_split_brain (xlator_t *this, inode_t *inode) +void +afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, + unsigned char *replies) { - int ret = 0; + int i = 0; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + replies[i] = 1; + } else { + replies[i] = 0; + } + } +} - uint64_t ctx = 0; - uint64_t split_brain = 0; +int +afr_fav_child_reset_sink_xattrs(void *opaque); - VALIDATE_OR_GOTO (inode, out); +int +afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *frame, void *opaque); - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); +static void +afr_discover_done(call_frame_t *frame, xlator_t *this); - if (ret < 0) - goto unlock; +int +afr_dom_lock_acquire_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = (long)cookie; + + local->cont.lk.dom_lock_op_ret[i] = op_ret; + local->cont.lk.dom_lock_op_errno[i] = op_errno; + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to acquire %s on 
%s", + uuid_utoa(local->fd->inode->gfid), AFR_LK_HEAL_DOM, + priv->children[i]->name); + } else { + local->cont.lk.dom_locked_nodes[i] = 1; + } + + syncbarrier_wake(&local->barrier); + + return 0; +} - split_brain = ctx & AFR_ICTX_SPLIT_BRAIN_MASK; +int +afr_dom_lock_acquire(call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct gf_flock flock = { + 0, + }; + int i = 0; + + priv = frame->this->private; + local = frame->local; + local->cont.lk.dom_locked_nodes = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.locked_nodes), + gf_afr_mt_char); + if (!local->cont.lk.dom_locked_nodes) { + return -ENOMEM; + } + local->cont.lk.dom_lock_op_ret = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.dom_lock_op_ret), + gf_afr_mt_int32_t); + if (!local->cont.lk.dom_lock_op_ret) { + return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */ + } + local->cont.lk.dom_lock_op_errno = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.dom_lock_op_errno), + gf_afr_mt_int32_t); + if (!local->cont.lk.dom_lock_op_errno) { + return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. 
*/ + } + flock.l_type = F_WRLCK; + + AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM, + local->fd, F_SETLK, &flock, NULL); + + if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) + goto blocking_lock; + + /*If any of the bricks returned EAGAIN, we still need blocking locks.*/ + if (AFR_COUNT(local->cont.lk.dom_locked_nodes, priv->child_count) != + priv->child_count) { + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lk.dom_lock_op_ret[i] == -1 && + local->cont.lk.dom_lock_op_errno[i] == EAGAIN) + goto blocking_lock; } -unlock: - UNLOCK (&inode->lock); + } -out: - return split_brain; + return 0; + +blocking_lock: + afr_dom_lock_release(frame); + AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM, + local->fd, F_SETLKW, &flock, NULL); + if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) { + afr_dom_lock_release(frame); + return -afr_quorum_errno(priv); + } + + return 0; } +int +afr_dom_lock_release_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = (long)cookie; + + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to release %s on %s", local->loc.path, + AFR_LK_HEAL_DOM, priv->children[i]->name); + } + local->cont.lk.dom_locked_nodes[i] = 0; + + syncbarrier_wake(&local->barrier); + + return 0; +} void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set) +afr_dom_lock_release(call_frame_t *frame) { - uint64_t ctx = 0; - int ret = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + struct gf_flock flock = { + 0, + }; + + local = frame->local; + priv = frame->this->private; + locked_on = local->cont.lk.dom_locked_nodes; + if (AFR_COUNT(locked_on, priv->child_count) == 0) + return; + flock.l_type = F_UNLCK; - VALIDATE_OR_GOTO 
(inode, out); + AFR_ONLIST(locked_on, frame, afr_dom_lock_release_cbk, finodelk, + AFR_LK_HEAL_DOM, local->fd, F_SETLK, &flock, NULL); - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); + return; +} - if (ret < 0) { - ctx = 0; - } +static void +afr_lk_heal_info_cleanup(afr_lk_heal_info_t *info) +{ + if (!info) + return; + if (info->xdata_req) + dict_unref(info->xdata_req); + if (info->fd) + fd_unref(info->fd); + GF_FREE(info->locked_nodes); + GF_FREE(info->child_up_event_gen); + GF_FREE(info->child_down_event_gen); + GF_FREE(info); +} - if (set) { - ctx = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx) - | (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK); - } else { - ctx = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx); - } - __inode_ctx_put (inode, this, ctx); +static int +afr_add_lock_to_saved_locks(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + afr_lk_heal_info_t *info = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -ENOMEM; + + info = GF_CALLOC(sizeof(*info), 1, gf_afr_mt_lk_heal_info_t); + if (!info) { + goto cleanup; + } + INIT_LIST_HEAD(&info->pos); + info->fd = fd_ref(local->fd); + info->cmd = local->cont.lk.cmd; + info->pid = frame->root->pid; + info->flock = local->cont.lk.user_flock; + info->xdata_req = dict_copy_with_ref(local->xdata_req, NULL); + if (!info->xdata_req) { + goto cleanup; + } + info->lk_owner = frame->root->lk_owner; + info->locked_nodes = GF_MALLOC( + sizeof(*info->locked_nodes) * priv->child_count, gf_afr_mt_char); + if (!info->locked_nodes) { + goto cleanup; + } + memcpy(info->locked_nodes, local->cont.lk.locked_nodes, + sizeof(*info->locked_nodes) * priv->child_count); + info->child_up_event_gen = GF_CALLOC(sizeof(*info->child_up_event_gen), + priv->child_count, gf_afr_mt_int32_t); + if (!info->child_up_event_gen) { + goto cleanup; + } + info->child_down_event_gen = GF_CALLOC(sizeof(*info->child_down_event_gen), + priv->child_count, + gf_afr_mt_int32_t); + if 
(!info->child_down_event_gen) { + goto cleanup; + } + + LOCK(&local->fd->lock); + { + fd_ctx = __afr_fd_ctx_get(local->fd, this); + if (fd_ctx) + fd_ctx->lk_heal_info = info; + } + UNLOCK(&local->fd->lock); + if (!fd_ctx) { + goto cleanup; + } + + LOCK(&priv->lock); + { + list_add_tail(&info->pos, &priv->saved_locks); + } + UNLOCK(&priv->lock); + + return 0; +cleanup: + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to add lock to healq", + uuid_utoa(local->fd->inode->gfid)); + if (info) { + afr_lk_heal_info_cleanup(info); + if (fd_ctx) { + LOCK(&local->fd->lock); + { + fd_ctx->lk_heal_info = NULL; + } + UNLOCK(&local->fd->lock); } - UNLOCK (&inode->lock); + } + return ret; +} + +static int +afr_remove_lock_from_saved_locks(afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = this->private; + struct gf_flock flock = local->cont.lk.user_flock; + afr_lk_heal_info_t *info = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -EINVAL; + + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx || !fd_ctx->lk_heal_info) { + goto out; + } + + info = fd_ctx->lk_heal_info; + if ((info->flock.l_start != flock.l_start) || + (info->flock.l_whence != flock.l_whence) || + (info->flock.l_len != flock.l_len)) { + /*TODO: Compare lkowners too.*/ + goto out; + } + + LOCK(&priv->lock); + { + list_del(&fd_ctx->lk_heal_info->pos); + } + UNLOCK(&priv->lock); + + afr_lk_heal_info_cleanup(info); + fd_ctx->lk_heal_info = NULL; + ret = 0; out: - return; + if (ret) + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to remove lock from healq", + uuid_utoa(local->fd->inode->gfid)); + return ret; } +int +afr_lock_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret != 
0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "Failed to heal lock on child %d for %s", i, + uuid_utoa(local->fd->inode->gfid)); + } + syncbarrier_wake(&local->barrier); + return 0; +} -uint64_t -afr_is_opendir_done (xlator_t *this, inode_t *inode) +int +afr_getlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { - int ret = 0; + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid)); + } else { + local->cont.lk.getlk_rsp[i] = *lock; + } + + syncbarrier_wake(&local->barrier); + return 0; +} - uint64_t ctx = 0; - uint64_t opendir_done = 0; +/* Winds F_GETLK to every child marked in info->locked_nodes. Returns _gf_false when no child replied successfully, when a successful reply reports a lock (l_type != F_UNLCK) whose owner differs from info->lk_owner, or when the reply array cannot be allocated; _gf_true otherwise. */ +static gf_boolean_t +afr_does_lk_owner_match(call_frame_t *frame, afr_private_t *priv, + afr_lk_heal_info_t *info) +{ + int i = 0; + afr_local_t *local = frame->local; + struct gf_flock flock = { + 0, + }; + gf_boolean_t ret = _gf_true; + char *wind_on = alloca0(priv->child_count); + unsigned char *success_replies = alloca0(priv->child_count); + local->cont.lk.getlk_rsp = GF_CALLOC(sizeof(*local->cont.lk.getlk_rsp), + priv->child_count, gf_afr_mt_gf_lock); + if (!local->cont.lk.getlk_rsp) { + /* afr_getlk_cbk stores into getlk_rsp[i]; without the array nothing may be wound, so report "no match" and let the caller skip this heal attempt. */ + return _gf_false; + } + + flock = info->flock; + for (i = 0; i < priv->child_count; i++) { + if (info->locked_nodes[i]) + wind_on[i] = 1; + } + + AFR_ONLIST(wind_on, frame, afr_getlk_cbk, lk, info->fd, F_GETLK, &flock, + info->xdata_req); + + afr_fill_success_replies(local, priv, success_replies); + if (AFR_COUNT(success_replies, priv->child_count) == 0) { + ret = _gf_false; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid || local->replies[i].op_ret != 0) + continue; + if (local->cont.lk.getlk_rsp[i].l_type == F_UNLCK) + continue; + /*TODO: Do we really need to compare lkowner if F_UNLCK is 
true?*/ + if (!is_same_lkowner(&local->cont.lk.getlk_rsp[i].l_owner, + &info->lk_owner)) { + ret = _gf_false; + break; + } + } +out: + afr_local_replies_wipe(local, priv); + GF_FREE(local->cont.lk.getlk_rsp); + local->cont.lk.getlk_rsp = NULL; + return ret; +} - VALIDATE_OR_GOTO (inode, out); +static void +afr_mark_fd_bad(fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); + if (!fd) + return; + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get(fd, this); + if (fd_ctx) { + fd_ctx->is_fd_bad = _gf_true; + fd_ctx->lk_heal_info = NULL; + } + } + UNLOCK(&fd->lock); +} - if (ret < 0) - goto unlock; +static void +afr_add_lock_to_lkhealq(afr_private_t *priv, afr_lk_heal_info_t *info) +{ + LOCK(&priv->lock); + { + list_del(&info->pos); + list_add_tail(&info->pos, &priv->lk_healq); + } + UNLOCK(&priv->lock); +} - opendir_done = ctx & AFR_ICTX_OPENDIR_DONE_MASK; +/* Re-acquires the saved lock described by info on the children not yet marked in info->locked_nodes, while holding the AFR_LK_HEAL_DOM lock. A child is marked locked only if its child_up_event_gen is unchanged across the wind and greater than its child_down_event_gen; when no child ends up marked, info is put back on priv->lk_healq for a later retry. */ +static void +afr_lock_heal_do(call_frame_t *frame, afr_private_t *priv, + afr_lk_heal_info_t *info) +{ + int i = 0; + int op_errno = 0; + int32_t *current_event_gen = NULL; + afr_local_t *local = frame->local; + xlator_t *this = frame->this; + char *wind_on = alloca0(priv->child_count); + gf_boolean_t retry = _gf_true; + + frame->root->pid = info->pid; + lk_owner_copy(&frame->root->lk_owner, &info->lk_owner); + + op_errno = -afr_dom_lock_acquire(frame); + if ((op_errno != 0)) { + goto release; + } + + if (!afr_does_lk_owner_match(frame, priv, info)) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_LK_HEAL_DOM, + "Ignoring lock heal for %s since lk-onwers mismatch. 
" + "Lock possibly pre-empted by another client.", + uuid_utoa(info->fd->inode->gfid)); + goto release; + } + + for (i = 0; i < priv->child_count; i++) { + if (info->locked_nodes[i]) + continue; + wind_on[i] = 1; + } + + /* Snapshot of the child-up generations taken before winding: one int32_t per child, so the buffer is sized in bytes (count * element size), matching the memcpy below. */ + current_event_gen = alloca(priv->child_count * sizeof(*current_event_gen)); + memcpy(current_event_gen, info->child_up_event_gen, + priv->child_count * sizeof *current_event_gen); + AFR_ONLIST(wind_on, frame, afr_lock_heal_cbk, lk, info->fd, info->cmd, + &info->flock, info->xdata_req); + + LOCK(&priv->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (!wind_on[i]) + continue; + if ((!local->replies[i].valid) || (local->replies[i].op_ret != 0)) { + continue; + } + + if ((current_event_gen[i] == info->child_up_event_gen[i]) && + (current_event_gen[i] > info->child_down_event_gen[i])) { + info->locked_nodes[i] = 1; + retry = _gf_false; + list_del_init(&info->pos); + list_add_tail(&info->pos, &priv->saved_locks); + } else { + /*We received subsequent child up/down events while heal was in + * progress; don't mark child as healed. 
Attempt again on the + * new child up*/ + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_LK_HEAL_DOM, + "Event gen mismatch: skipped healing lock on child %d " + "for %s.", + i, uuid_utoa(info->fd->inode->gfid)); + } } -unlock: - UNLOCK (&inode->lock); + } + UNLOCK(&priv->lock); + +release: + afr_dom_lock_release(frame); + if (retry) + afr_add_lock_to_lkhealq(priv, info); + return; +} -out: - return opendir_done; +static int +afr_lock_heal_done(int ret, call_frame_t *frame, void *opaque) +{ + STACK_DESTROY(frame->root); + return 0; } +/* Synctask body: takes over everything queued on priv->lk_healq and runs afr_lock_heal_do on each entry using a private copy of the frame. If that frame has no local after a reset, the entries not yet processed are handed back to priv->lk_healq and ENOTCONN is returned; returns ENOMEM when the frame copy cannot be created, 0 otherwise. */ +static int +afr_lock_heal(void *opaque) +{ + call_frame_t *frame = (call_frame_t *)opaque; + call_frame_t *iter_frame = NULL; + xlator_t *this = frame->this; + afr_private_t *priv = this->private; + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + struct list_head healq = { + 0, + }; + int ret = 0; + + iter_frame = afr_copy_frame(frame); + if (!iter_frame) { + return ENOMEM; + } + + INIT_LIST_HEAD(&healq); + LOCK(&priv->lock); + { + list_splice_init(&priv->lk_healq, &healq); + } + UNLOCK(&priv->lock); + + list_for_each_entry_safe(info, tmp, &healq, pos) + { + GF_ASSERT((AFR_COUNT(info->locked_nodes, priv->child_count) < + priv->child_count)); + ((afr_local_t *)(iter_frame->local))->fd = fd_ref(info->fd); + afr_lock_heal_do(iter_frame, priv, info); + AFR_STACK_RESET(iter_frame); + if (iter_frame->local == NULL) { + ret = ENOTCONN; + gf_msg(frame->this->name, GF_LOG_ERROR, ENOTCONN, + AFR_MSG_LK_HEAL_DOM, + "Aborting processing of lk_healq." 
+ "Healing will be reattempted on next child up for locks " + "that are still in quorum."); + LOCK(&priv->lock); + { + /* healq is a list head on this function's stack: move the unprocessed entries back rather than linking the local head itself into the private list, which would dangle after return. */ + list_splice_init(&healq, &priv->lk_healq); + } + UNLOCK(&priv->lock); + break; + } + } + + AFR_STACK_DESTROY(iter_frame); + return ret; +} -void -afr_set_opendir_done (xlator_t *this, inode_t *inode) +/* Moves every entry of priv->saved_locks to priv->lk_healq, recording priv->event_generation as the child-up generation for `child`, then launches afr_lock_heal as a synctask. Returns 0 immediately when priv->shd.iamshd is set, -1 when no frame can be created, otherwise the result of synctask_new. NOTE(review): the lists are walked without taking priv->lock here - presumably the caller holds it; confirm. */ +static int +__afr_lock_heal_synctask(xlator_t *this, afr_private_t *priv, int child) { - uint64_t ctx = 0; - int ret = 0; + int ret = 0; + call_frame_t *frame = NULL; + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; - VALIDATE_OR_GOTO (inode, out); + if (priv->shd.iamshd) + return 0; - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); + list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos) + { + info->child_up_event_gen[child] = priv->event_generation; + list_del_init(&info->pos); + list_add_tail(&info->pos, &priv->lk_healq); + } - if (ret < 0) { - ctx = 0; - } + frame = create_frame(this, this->ctx->pool); + if (!frame) + return -1; - ctx = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx) - | (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK); + ret = synctask_new(this->ctx->env, afr_lock_heal, afr_lock_heal_done, frame, + frame); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_LK_HEAL_DOM, + "Failed to launch lock heal synctask"); + /* afr_lock_heal_done is the only place that destroys this frame and it does not run for a task that was never launched, so release the frame here. */ + STACK_DESTROY(frame->root); + } - __inode_ctx_put (inode, this, ctx); - } - UNLOCK (&inode->lock); -out: - return; + return ret; } - -uint64_t -afr_read_child (xlator_t *this, inode_t *inode) +static int +__afr_mark_pending_lk_heal(xlator_t *this, afr_private_t *priv, int child) { - int ret = 0; + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; - uint64_t ctx = 0; - uint64_t read_child = 0; - - VALIDATE_OR_GOTO (inode, out); + if (priv->shd.iamshd) + return 0; + list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos) + { + info->child_down_event_gen[child] = priv->event_generation; + if (info->locked_nodes[child] == 1) + info->locked_nodes[child] = 0; + if 
(!afr_has_quorum(info->locked_nodes, this, NULL)) { + /* Since the lock was lost on quorum no. of nodes, we should + * not attempt to heal it anymore. Some other client could have + * acquired the lock, modified data and released it and this + * client wouldn't know about it if we heal it.*/ + afr_mark_fd_bad(info->fd, this); + list_del(&info->pos); + afr_lk_heal_info_cleanup(info); + /* We're not winding an unlock on the node where the lock is still + * present because when fencing logic switches over to the new + * client (since we marked the fd bad), it should preempt any + * existing lock. */ + } + } + return 0; +} - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); +gf_boolean_t +afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv, + int32_t *op_errno) +{ + if (priv->consistent_io && local->call_count != priv->child_count) { + gf_msg(THIS->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOLS_DOWN, + "All subvolumes are not up"); + if (op_errno) + *op_errno = ENOTCONN; + return _gf_false; + } + return _gf_true; +} - if (ret < 0) - goto unlock; +gf_boolean_t +afr_is_lock_mode_mandatory(dict_t *xdata) +{ + int ret = 0; + uint32_t lk_mode = GF_LK_ADVISORY; - read_child = ctx & AFR_ICTX_READ_CHILD_MASK; - } -unlock: - UNLOCK (&inode->lock); + ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_mode); + if (!ret && lk_mode == GF_LK_MANDATORY) + return _gf_true; -out: - return read_child; + return _gf_false; } +call_frame_t * +afr_copy_frame(call_frame_t *base) +{ + afr_local_t *local = NULL; + call_frame_t *frame = NULL; + int op_errno = 0; + + frame = copy_frame(base); + if (!frame) + return NULL; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) { + AFR_STACK_DESTROY(frame); + return NULL; + } + + return frame; +} -void -afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child) +/* Check if an entry or inode could be undergoing a transaction. 
*/ +gf_boolean_t +afr_is_possibly_under_txn(afr_transaction_type type, afr_local_t *local, + xlator_t *this) { - uint64_t ctx = 0; - int ret = 0; + int i = 0; + int tmp = 0; + afr_private_t *priv = NULL; + GF_UNUSED char *key = NULL; + int keylen = 0; + + priv = this->private; + + if (type == AFR_ENTRY_TRANSACTION) { + key = GLUSTERFS_PARENT_ENTRYLK; + keylen = SLEN(GLUSTERFS_PARENT_ENTRYLK); + } else if (type == AFR_DATA_TRANSACTION) { + /*FIXME: Use GLUSTERFS_INODELK_DOM_COUNT etc. once + * pl_inodelk_xattr_fill supports separate keys for different + * domains.*/ + key = GLUSTERFS_INODELK_COUNT; + keylen = SLEN(GLUSTERFS_INODELK_COUNT); + } + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].xdata) + continue; + if (dict_get_int32n(local->replies[i].xdata, key, keylen, &tmp) == 0) + if (tmp) + return _gf_true; + } + + return _gf_false; +} - VALIDATE_OR_GOTO (inode, out); +static void +afr_inode_ctx_destroy(afr_inode_ctx_t *ctx) +{ + int i = 0; - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); + if (!ctx) + return; - if (ret < 0) { - ctx = 0; - } + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + GF_FREE(ctx->pre_op_done[i]); + } - ctx = (~AFR_ICTX_READ_CHILD_MASK & ctx) - | (AFR_ICTX_READ_CHILD_MASK & read_child); + GF_FREE(ctx); +} - __inode_ctx_put (inode, this, ctx); +int +__afr_inode_ctx_get(xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx) +{ + uint64_t ctx_int = 0; + int ret = -1; + int i = -1; + int num_locks = -1; + afr_inode_ctx_t *ictx = NULL; + afr_lock_t *lock = NULL; + afr_private_t *priv = this->private; + + ret = __inode_ctx_get(inode, this, &ctx_int); + if (ret == 0) { + *ctx = (afr_inode_ctx_t *)(uintptr_t)ctx_int; + return 0; + } + + ictx = GF_CALLOC(1, sizeof(afr_inode_ctx_t), gf_afr_mt_inode_ctx_t); + if (!ictx) + goto out; + + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + ictx->pre_op_done[i] = GF_CALLOC(sizeof *ictx->pre_op_done[i], + priv->child_count, gf_afr_mt_int32_t); + if (!ictx->pre_op_done[i]) 
{ + ret = -ENOMEM; + goto out; } - UNLOCK (&inode->lock); - + } + + num_locks = sizeof(ictx->lock) / sizeof(afr_lock_t); + for (i = 0; i < num_locks; i++) { + lock = &ictx->lock[i]; + INIT_LIST_HEAD(&lock->post_op); + INIT_LIST_HEAD(&lock->frozen); + INIT_LIST_HEAD(&lock->waiting); + INIT_LIST_HEAD(&lock->owners); + } + + ctx_int = (uint64_t)(uintptr_t)ictx; + ret = __inode_ctx_set(inode, this, &ctx_int); + if (ret) { + goto out; + } + + ictx->spb_choice = -1; + ictx->read_subvol = 0; + ictx->write_subvol = 0; + ictx->lock_count = 0; + ret = 0; + *ctx = ictx; out: - return; + if (ret) { + afr_inode_ctx_destroy(ictx); + } + return ret; } - -/** - * afr_local_cleanup - cleanup everything in frame->local +/* + * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS: + * + * |<---------- 64bit ------------>| + * 63 32 31 16 15 0 + * | EVENT_GEN | DATA | METADATA | + * + * + * METADATA (bit-0 .. bit-15): bitmap representing subvolumes from which + * metadata can be attempted to be read. + * + * bit-0 => priv->subvolumes[0] + * bit-1 => priv->subvolumes[1] + * ... etc. till bit-15 + * + * DATA (bit-16 .. bit-31): bitmap representing subvolumes from which data + * can be attempted to be read. + * + * bit-16 => priv->subvolumes[0] + * bit-17 => priv->subvolumes[1] + * ... etc. till bit-31 + * + * EVENT_GEN (bit-32 .. bit-63): event generation (i.e priv->event_generation) + * when DATA and METADATA was last updated. + * + * If EVENT_GEN is < priv->event_generation, + * or is 0, it means afr_inode_refresh() needs + * to be called to recalculate the bitmaps. 
*/ -void -afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) +int +__afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local, + inode_t *inode) { - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - + int i = 0; + int txn_type = 0; + int count = 0; + int index = -1; + uint16_t datamap_old = 0; + uint16_t metadatamap_old = 0; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint16_t tmp_map = 0; + uint16_t mask = 0; + uint32_t event = 0; + uint64_t val = 0; + afr_private_t *priv = NULL; + + priv = this->private; + txn_type = local->transaction.type; + + if (txn_type == AFR_DATA_TRANSACTION) + val = local->inode_ctx->write_subvol; + else + val = local->inode_ctx->read_subvol; + + metadatamap_old = metadatamap = (val & 0x000000000000ffff); + datamap_old = datamap = (val & 0x00000000ffff0000) >> 16; + event = (val & 0xffffffff00000000) >> 32; + + if (txn_type == AFR_DATA_TRANSACTION) + tmp_map = datamap; + else if (txn_type == AFR_METADATA_TRANSACTION) + tmp_map = metadatamap; + + count = gf_bits_count(tmp_map); + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.failed_subvols[i]) + continue; + + mask = 1 << i; + if (txn_type == AFR_METADATA_TRANSACTION) + metadatamap &= ~mask; + else if (txn_type == AFR_DATA_TRANSACTION) + datamap &= ~mask; + } + + switch (txn_type) { + case AFR_METADATA_TRANSACTION: + if ((metadatamap_old != 0) && (metadatamap == 0) && (count == 1)) { + index = gf_bits_index(tmp_map); + local->transaction.in_flight_sb_errno = local->replies[index] + .op_errno; + local->transaction.in_flight_sb = _gf_true; + metadatamap |= (1 << index); + } + if (metadatamap_old != metadatamap) { + __afr_inode_need_refresh_set(inode, this); + } + break; + + case AFR_DATA_TRANSACTION: + if ((datamap_old != 0) && (datamap == 0) && (count == 1)) { + index = gf_bits_index(tmp_map); + local->transaction.in_flight_sb_errno = local->replies[index] + .op_errno; + local->transaction.in_flight_sb = _gf_true; + 
datamap |= (1 << index); + } + if (datamap_old != datamap) + __afr_inode_need_refresh_set(inode, this); + break; - sh = &local->self_heal; - priv = this->private; + default: + break; + } - if (sh->buf) - GF_FREE (sh->buf); + val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) | + (((uint64_t)event) << 32); - if (sh->xattr) { - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - } - GF_FREE (sh->xattr); - } + if (txn_type == AFR_DATA_TRANSACTION) + local->inode_ctx->write_subvol = val; + local->inode_ctx->read_subvol = val; - if (sh->child_errno) - GF_FREE (sh->child_errno); + return 0; +} - if (sh->pending_matrix) { - for (i = 0; i < priv->child_count; i++) { - GF_FREE (sh->pending_matrix[i]); - } - GF_FREE (sh->pending_matrix); +gf_boolean_t +afr_is_symmetric_error(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int op_errno = 0; + int i_errno = 0; + gf_boolean_t matching_errors = _gf_true; + int i = 0; + + priv = this->private; + local = frame->local; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret != -1) { + /* Operation succeeded on at least one subvol, + so it is not a failed-everywhere situation. + */ + matching_errors = _gf_false; + break; + } + i_errno = local->replies[i].op_errno; + + if (i_errno == ENOTCONN) { + /* ENOTCONN is not a symmetric error. We do not + know if the operation was performed on the + backend or not. 
+ */ + matching_errors = _gf_false; + break; } - if (sh->delta_matrix) { - for (i = 0; i < priv->child_count; i++) { - GF_FREE (sh->delta_matrix[i]); - } - GF_FREE (sh->delta_matrix); + if (!op_errno) { + op_errno = i_errno; + } else if (op_errno != i_errno) { + /* Mismatching op_errno's */ + matching_errors = _gf_false; + break; } + } - if (sh->sources) - GF_FREE (sh->sources); + return matching_errors; +} - if (sh->success) - GF_FREE (sh->success); +int +afr_set_in_flight_sb_status(xlator_t *this, call_frame_t *frame, inode_t *inode) +{ + int ret = -1; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - if (sh->locked_nodes) - GF_FREE (sh->locked_nodes); + priv = this->private; + local = frame->local; - if (sh->healing_fd && !sh->healing_fd_opened) { - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - } + /* If this transaction saw no failures, then exit. */ + if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count) == 0) + return 0; - if (sh->linkname) - GF_FREE ((char *)sh->linkname); + if (afr_is_symmetric_error(frame, this)) + return 0; - loc_wipe (&sh->parent_loc); -} + LOCK(&inode->lock); + { + ret = __afr_set_in_flight_sb_status(this, local, inode); + } + UNLOCK(&inode->lock); + return ret; +} -void -afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) +int +__afr_inode_read_subvol_get_small(inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) { - int i = 0; - afr_private_t * priv = NULL; + afr_private_t *priv = NULL; + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; + int i = 0; + afr_inode_ctx_t *ctx = NULL; + + priv = this->private; + + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret < 0) + return ret; - priv = this->private; + val = ctx->read_subvol; - for (i = 0; i < priv->child_count; i++) { - if (local->pending && local->pending[i]) - GF_FREE (local->pending[i]); - } + metadatamap = (val & 
0x000000000000ffff); + datamap = (val & 0x00000000ffff0000) >> 16; + event = (val & 0xffffffff00000000) >> 32; - GF_FREE (local->pending); + for (i = 0; i < priv->child_count; i++) { + if (metadata) + metadata[i] = (metadatamap >> i) & 1; + if (data) + data[i] = (datamap >> i) & 1; + } - if (local->internal_lock.locked_nodes) - GF_FREE (local->internal_lock.locked_nodes); + if (event_p) + *event_p = event; + return ret; +} - if (local->internal_lock.inode_locked_nodes) - GF_FREE (local->internal_lock.inode_locked_nodes); +int +__afr_inode_read_subvol_set_small(inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int event) +{ + afr_private_t *priv = NULL; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint64_t val = 0; + int i = 0; + int ret = -1; + afr_inode_ctx_t *ctx = NULL; - if (local->internal_lock.entry_locked_nodes) - GF_FREE (local->internal_lock.entry_locked_nodes); + priv = this->private; - if (local->internal_lock.lower_locked_nodes) - GF_FREE (local->internal_lock.lower_locked_nodes); + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret) + goto out; + for (i = 0; i < priv->child_count; i++) { + if (data[i]) + datamap |= (1 << i); + if (metadata[i]) + metadatamap |= (1 << i); + } - GF_FREE (local->transaction.child_errno); - GF_FREE (local->child_errno); + val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) | + (((uint64_t)event) << 32); - GF_FREE (local->transaction.basename); - GF_FREE (local->transaction.new_basename); + ctx->read_subvol = val; - loc_wipe (&local->transaction.parent_loc); - loc_wipe (&local->transaction.new_parent_loc); + ret = 0; +out: + return ret; } - -void -afr_local_cleanup (afr_local_t *local, xlator_t *this) +int +__afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int *event_p) { - int i; - afr_private_t * priv = NULL; + afr_private_t *priv = NULL; + int ret = -1; - if (!local) - return; + priv = this->private; - 
afr_local_sh_cleanup (local, this); + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_get_small(inode, this, data, metadata, + event_p); + else + /* TBD: allocate structure with array and read from it */ + ret = -1; - afr_local_transaction_cleanup (local, this); + return ret; +} - priv = this->private; +int +__afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this, + int *spb_choice) +{ + afr_inode_ctx_t *ctx = NULL; + int ret = -1; - loc_wipe (&local->loc); - loc_wipe (&local->newloc); + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret < 0) + return ret; - if (local->fd) - fd_unref (local->fd); - - if (local->xattr_req) - dict_unref (local->xattr_req); - - GF_FREE (local->child_up); - - { /* lookup */ - if (local->cont.lookup.xattrs) { - for (i = 0; i < priv->child_count; i++) { - if (local->cont.lookup.xattrs[i]) { - dict_unref (local->cont.lookup.xattrs[i]); - local->cont.lookup.xattrs[i] = NULL; - } - } - GF_FREE (local->cont.lookup.xattrs); - local->cont.lookup.xattrs = NULL; - } + *spb_choice = ctx->spb_choice; + return 0; +} - if (local->cont.lookup.xattr) { - dict_unref (local->cont.lookup.xattr); - } +int +__afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) +{ + afr_private_t *priv = NULL; + int ret = -1; - if (local->cont.lookup.inode) { - inode_unref (local->cont.lookup.inode); - } - } + priv = this->private; - { /* getxattr */ - if (local->cont.getxattr.name) - GF_FREE (local->cont.getxattr.name); - } + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_set_small(inode, this, data, metadata, + event); + else + ret = -1; - { /* lk */ - if (local->cont.lk.locked_nodes) - GF_FREE (local->cont.lk.locked_nodes); - } + return ret; +} - { /* create */ - if (local->cont.create.fd) - fd_unref (local->cont.create.fd); - if (local->cont.create.params) - dict_unref (local->cont.create.params); - } +int +__afr_inode_split_brain_choice_set(inode_t *inode, 
xlator_t *this, + int spb_choice) +{ + afr_inode_ctx_t *ctx = NULL; + int ret = -1; - { /* mknod */ - if (local->cont.mknod.params) - dict_unref (local->cont.mknod.params); - } + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret) + goto out; - { /* mkdir */ - if (local->cont.mkdir.params) - dict_unref (local->cont.mkdir.params); - } + ctx->spb_choice = spb_choice; - { /* symlink */ - if (local->cont.symlink.params) - dict_unref (local->cont.symlink.params); - } + ret = 0; +out: + return ret; +} - { /* writev */ - GF_FREE (local->cont.writev.vector); - } +int +afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int *event_p) +{ + int ret = -1; - { /* setxattr */ - if (local->cont.setxattr.dict) - dict_unref (local->cont.setxattr.dict); - } + GF_VALIDATE_OR_GOTO(this->name, inode, out); - { /* removexattr */ - GF_FREE (local->cont.removexattr.name); - } + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_get(inode, this, data, metadata, event_p); + } + UNLOCK(&inode->lock); +out: + return ret; +} - { /* symlink */ - GF_FREE (local->cont.symlink.linkpath); +int +afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, int type) +{ + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + unsigned char *data = alloca0(priv->child_count); + unsigned char *metadata = alloca0(priv->child_count); + int data_count = 0; + int metadata_count = 0; + int event_generation = 0; + int ret = 0; + + ret = afr_inode_read_subvol_get(inode, this, data, metadata, + &event_generation); + if (ret == -1) + return -EIO; + + data_count = AFR_COUNT(data, priv->child_count); + metadata_count = AFR_COUNT(metadata, priv->child_count); + + if (inode->ia_type == IA_IFDIR) { + /* For directories, allow even if it is in data split-brain. 
*/ + if (type == AFR_METADATA_TRANSACTION || local->op == GF_FOP_STAT || + local->op == GF_FOP_FSTAT) { + if (!metadata_count) + return -EIO; } - - { /* opendir */ - if (local->cont.opendir.checksum) - GF_FREE (local->cont.opendir.checksum); + } else { + /* For files, abort in case of data/metadata split-brain. */ + if (!data_count || !metadata_count) { + return -EIO; } + } + + if (type == AFR_METADATA_TRANSACTION && readable) + memcpy(readable, metadata, priv->child_count * sizeof *metadata); + if (type == AFR_DATA_TRANSACTION && readable) { + if (!data_count) + memcpy(readable, local->child_up, + priv->child_count * sizeof *readable); + else + memcpy(readable, data, priv->child_count * sizeof *data); + } + if (event_p) + *event_p = event_generation; + return 0; } +static int +afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this, + int *spb_choice) +{ + int ret = -1; + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + LOCK(&inode->lock); + { + ret = __afr_inode_split_brain_choice_get(inode, this, spb_choice); + } + UNLOCK(&inode->lock); +out: + return ret; +} +/* + * frame is used to get the favourite policy. Since + * afr_inode_split_brain_choice_get was called with afr_open, it is possible to + * have a frame with out local->replies. So in that case, frame is passed as + * null, hence this function will handle the frame NULL case. 
+ */ int -afr_frame_return (call_frame_t *frame) +afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this, + call_frame_t *frame, int *spb_subvol) { - afr_local_t *local = NULL; - int call_count = 0; + int ret = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; + GF_VALIDATE_OR_GOTO("afr", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, spb_subvol, out); - LOCK (&frame->lock); - { - call_count = --local->call_count; + priv = this->private; + + ret = afr_inode_split_brain_choice_get(inode, this, spb_subvol); + if (*spb_subvol < 0 && priv->fav_child_policy && frame && frame->local) { + local = frame->local; + *spb_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode, + NULL); + if (*spb_subvol >= 0) { + ret = 0; } - UNLOCK (&frame->lock); + } - return call_count; +out: + return ret; } +int +afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) +{ + int ret = -1; + GF_VALIDATE_OR_GOTO(this->name, inode, out); -/** - * up_children_count - return the number of children that are up - */ + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_set(inode, this, data, metadata, event); + } + UNLOCK(&inode->lock); +out: + return ret; +} int -afr_up_children_count (int child_count, unsigned char *child_up) +afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this, int spb_choice) { - int i = 0; - int ret = 0; + int ret = -1; - for (i = 0; i < child_count; i++) - if (child_up[i]) - ret++; - return ret; -} + GF_VALIDATE_OR_GOTO(this->name, inode, out); + LOCK(&inode->lock); + { + ret = __afr_inode_split_brain_choice_set(inode, this, spb_choice); + } + UNLOCK(&inode->lock); +out: + return ret; +} -ino64_t -afr_itransform (ino64_t ino, int child_count, int child_index) +/* The caller of this should perform afr_inode_refresh, if this function + * returns 
_gf_true + */ +gf_boolean_t +afr_is_inode_refresh_reqd(inode_t *inode, xlator_t *this, int event_gen1, + int event_gen2) { - ino64_t scaled_ino = -1; + gf_boolean_t need_refresh = _gf_false; + afr_inode_ctx_t *ctx = NULL; + int ret = -1; - if (ino == ((uint64_t) -1)) { - scaled_ino = ((uint64_t) -1); - goto out; - } + GF_VALIDATE_OR_GOTO(this->name, inode, out); - scaled_ino = (ino * child_count) + child_index; + LOCK(&inode->lock); + { + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret) + goto unlock; + need_refresh = ctx->need_refresh; + /* Hoping that the caller will do inode_refresh followed by + * this, hence setting the need_refresh to false */ + ctx->need_refresh = _gf_false; + } +unlock: + UNLOCK(&inode->lock); + + if (event_gen1 != event_gen2) + need_refresh = _gf_true; out: - return scaled_ino; + return need_refresh; } - int -afr_deitransform_orig (ino64_t ino, int child_count) +__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) { - int index = -1; + int ret = -1; + afr_inode_ctx_t *ctx = NULL; - index = ino % child_count; + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret == 0) { + ctx->need_refresh = _gf_true; + } - return index; + return ret; } - int -afr_deitransform (ino64_t ino, int child_count) +afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) { - return 0; -} + int ret = -1; + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + LOCK(&inode->lock); + { + ret = __afr_inode_need_refresh_set(inode, this); + } + UNLOCK(&inode->lock); +out: + return ret; +} int -afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this) +afr_spb_choice_timeout_cancel(xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; + afr_inode_ctx_t *ctx = NULL; + int ret = -1; - local = frame->local; + if (!inode) + return ret; - if (local->govinda_gOvinda) { - afr_set_split_brain (this, local->cont.lookup.inode, _gf_true); + LOCK(&inode->lock); + { + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret < 0 || !ctx) { + 
UNLOCK(&inode->lock); + gf_msg(this->name, GF_LOG_WARNING, 0, + AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, + "Failed to cancel split-brain choice timer."); + goto out; + } + ctx->spb_choice = -1; + if (ctx->timer) { + gf_timer_call_cancel(this->ctx, ctx->timer); + ctx->timer = NULL; } + ret = 0; + } + UNLOCK(&inode->lock); +out: + return ret; +} - AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); +void +afr_set_split_brain_choice_cbk(void *data) +{ + inode_t *inode = data; + xlator_t *this = THIS; - return 0; + afr_spb_choice_timeout_cancel(this, inode); + inode_invalidate(inode); + inode_unref(inode); + return; } - -static void -afr_lookup_collect_xattr (afr_local_t *local, xlator_t *this, - int child_index, dict_t *xattr) +int +afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque) { - uint32_t inodelk_count = 0; - uint32_t entrylk_count = 0; + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + afr_inode_ctx_t *ctx = NULL; + inode_t *inode = NULL; + loc_t *loc = NULL; + xlator_t *this = NULL; + afr_spbc_timeout_t *data = opaque; + struct timespec delta = { + 0, + }; + gf_boolean_t timer_set = _gf_false; + gf_boolean_t timer_cancelled = _gf_false; + gf_boolean_t timer_reset = _gf_false; + int old_spb_choice = -1; + + frame = data->frame; + loc = data->loc; + this = frame->this; + priv = this->private; + + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + + delta.tv_sec = priv->spb_choice_timeout; + delta.tv_nsec = 0; + + if (!loc->inode) { + ret = -1; + op_errno = EINVAL; + goto out; + } + + if (!(data->d_spb || data->m_spb)) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, + "Cannot set " + "replica.split-brain-choice on %s. 
File is" + " not in data/metadata split-brain.", + uuid_utoa(loc->gfid)); + ret = -1; + op_errno = EINVAL; + goto out; + } + + /* + * we're ref'ing the inode before LOCK like it is done elsewhere in the + * code. If we ref after LOCK, coverity complains of possible deadlocks. + */ + inode = inode_ref(loc->inode); + + LOCK(&inode->lock); + { + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret) { + UNLOCK(&inode->lock); + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, + "Failed to get inode_ctx for %s", loc->name); + goto post_unlock; + } - int ret = 0; + old_spb_choice = ctx->spb_choice; + ctx->spb_choice = data->spb_child_index; + + /* Possible changes in spb-choice : + * valid to -1 : cancel timer and unref + * valid to valid : cancel timer and inject new one + * -1 to -1 : unref and do not do anything + * -1 to valid : inject timer + */ + + /* ctx->timer is NULL iff previous value of + * ctx->spb_choice is -1 + */ + if (ctx->timer) { + if (ctx->spb_choice == -1) { + if (!gf_timer_call_cancel(this->ctx, ctx->timer)) { + ctx->timer = NULL; + timer_cancelled = _gf_true; + } + /* If timer cancel failed here it means that the + * previous cbk will be executed which will set + * spb_choice to -1. So we can consider the + * 'valid to -1' case to be a success + * (i.e. ret = 0) and goto unlock. + */ + goto unlock; + } + goto reset_timer; + } else { + if (ctx->spb_choice == -1) + goto unlock; + goto set_timer; + } - if (afr_sh_has_metadata_pending (xattr, child_index, this)) { - local->self_heal.need_metadata_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "metadata self-heal is pending for %s.", - local->loc.path); + reset_timer: + ret = gf_timer_call_cancel(this->ctx, ctx->timer); + if (ret != 0) { + /* We need to bail out now instead of launching a new + * timer. Otherwise the cbk of the previous timer event + * will cancel the new ctx->timer. 
+ */ + ctx->spb_choice = old_spb_choice; + ret = -1; + op_errno = EAGAIN; + goto unlock; } + ctx->timer = NULL; + timer_reset = _gf_true; + + set_timer: + ctx->timer = gf_timer_call_after(this->ctx, delta, + afr_set_split_brain_choice_cbk, inode); + if (!ctx->timer) { + ctx->spb_choice = old_spb_choice; + ret = -1; + op_errno = ENOMEM; + } + if (!timer_reset && ctx->timer) + timer_set = _gf_true; + if (timer_reset && !ctx->timer) + timer_cancelled = _gf_true; + } +unlock: + UNLOCK(&inode->lock); +post_unlock: + if (!timer_set) + inode_unref(inode); + if (timer_cancelled) + inode_unref(inode); + /* + * We need to invalidate the inode to prevent the kernel from serving + * reads from an older cached value despite a change in spb_choice to + * a new value. + */ + inode_invalidate(inode); +out: + GF_FREE(data); + AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL); + return 0; +} + +int +afr_accused_fill(xlator_t *this, dict_t *xdata, unsigned char *accused, + afr_transaction_type type) +{ + afr_private_t *priv = NULL; + int i = 0; + int idx = afr_index_for_transaction_type(type); + void *pending_raw = NULL; + int pending[3]; + int ret = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + ret = dict_get_ptr(xdata, priv->pending_key[i], &pending_raw); + if (ret) /* no pending flags */ + continue; + memcpy(pending, pending_raw, sizeof(pending)); + + if (ntoh32(pending[idx])) + accused[i] = 1; + } + + return 0; +} + +int +afr_accuse_smallfiles(xlator_t *this, struct afr_reply *replies, + unsigned char *data_accused) +{ + int i = 0; + afr_private_t *priv = NULL; + uint64_t maxsize = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid && replies[i].xdata && + dict_get_sizen(replies[i].xdata, GLUSTERFS_BAD_INODE)) + continue; + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size > maxsize) + maxsize = replies[i].poststat.ia_size; + } + + for (i = 0; i < priv->child_count; i++) { + if 
(data_accused[i]) + continue; + if (AFR_IS_ARBITER_BRICK(priv, i)) + continue; + if (replies[i].poststat.ia_size < maxsize) + data_accused[i] = 1; + } + + return 0; +} - if (afr_sh_has_entry_pending (xattr, child_index, this)) { - local->self_heal.need_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "entry self-heal is pending for %s.", local->loc.path); +int +afr_readables_fill(call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *data_accused, unsigned char *metadata_accused, + unsigned char *data_readable, + unsigned char *metadata_readable, struct afr_reply *replies) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + int i = 0; + int ret = 0; + ia_type_t ia_type = IA_INVAL; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + data_readable[i] = 1; + metadata_readable[i] = 1; + } + if (AFR_IS_ARBITER_BRICK(priv, ARBITER_BRICK_INDEX)) { + data_readable[ARBITER_BRICK_INDEX] = 0; + metadata_readable[ARBITER_BRICK_INDEX] = 0; + } + + for (i = 0; i < priv->child_count; i++) { + if (replies) { /* Lookup */ + if (!replies[i].valid || replies[i].op_ret == -1 || + (replies[i].xdata && + dict_get_sizen(replies[i].xdata, GLUSTERFS_BAD_INODE))) { + data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } + + xdata = replies[i].xdata; + ia_type = replies[i].poststat.ia_type; + } else { /* pre-op xattrop */ + xdata = local->transaction.changelog_xdata[i]; + ia_type = inode->ia_type; } - if (afr_sh_has_data_pending (xattr, child_index, this)) { - local->self_heal.need_data_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "data self-heal is pending for %s.", local->loc.path); + if (!xdata) + continue; /* mkdir_cbk sends NULL xdata_rsp. */ + afr_accused_fill(this, xdata, data_accused, + (ia_type == IA_IFDIR) ? 
AFR_ENTRY_TRANSACTION + : AFR_DATA_TRANSACTION); + + afr_accused_fill(this, xdata, metadata_accused, + AFR_METADATA_TRANSACTION); + } + + if (replies && ia_type != IA_INVAL && ia_type != IA_IFDIR && + /* We want to accuse small files only when we know for + * sure that there is no IO happening. Otherwise, the + * ia_sizes obtained in post-refresh replies may + * mismatch due to a race between inode-refresh and + * ongoing writes, causing spurious heal launches*/ + !afr_is_possibly_under_txn(AFR_DATA_TRANSACTION, local, this)) { + afr_accuse_smallfiles(this, replies, data_accused); + } + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) { + data_readable[i] = 0; + ret = 1; + } + if (metadata_accused[i]) { + metadata_readable[i] = 0; + ret = 1; } + } + return ret; +} - ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, - &inodelk_count); - if (ret == 0) - local->inodelk_count += inodelk_count; +int +afr_replies_interpret(call_frame_t *frame, xlator_t *this, inode_t *inode, + gf_boolean_t *start_heal) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int event_generation = 0; + int i = 0; + unsigned char *data_accused = NULL; + unsigned char *metadata_accused = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + int ret = 0; + + local = frame->local; + priv = this->private; + replies = local->replies; + event_generation = local->event_generation; + + data_accused = alloca0(priv->child_count); + data_readable = alloca0(priv->child_count); + metadata_accused = alloca0(priv->child_count); + metadata_readable = alloca0(priv->child_count); + + ret = afr_readables_fill(frame, this, inode, data_accused, metadata_accused, + data_readable, metadata_readable, replies); + + for (i = 0; i < priv->child_count; i++) { + if (start_heal && priv->child_up[i] && + (data_accused[i] || metadata_accused[i])) { + *start_heal = _gf_true; + break; + } + } + 
afr_inode_read_subvol_set(inode, this, data_readable, metadata_readable, + event_generation); + return ret; +} - ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT, - &entrylk_count); - if (ret == 0) - local->entrylk_count += entrylk_count; +int +afr_refresh_selfheal_done(int ret, call_frame_t *heal, void *opaque) +{ + if (heal) + AFR_STACK_DESTROY(heal); + return 0; } +int +afr_inode_refresh_err(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int err = 0; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && !local->replies[i].op_ret) { + err = 0; + goto ret; + } + } -static void -afr_lookup_self_heal_check (xlator_t *this, afr_local_t *local, - struct iatt *buf, struct iatt *lookup_buf) + err = afr_final_errno(local, priv); +ret: + return err; +} + +gf_boolean_t +afr_selfheal_enabled(const xlator_t *this) { - if (FILETYPE_DIFFERS (buf, lookup_buf)) { - /* mismatching filetypes with same name - */ + const afr_private_t *priv = this->private; - gf_log (this->name, GF_LOG_NORMAL, - "filetype differs for %s ", local->loc.path); + return priv->data_self_heal || priv->metadata_self_heal || + priv->entry_self_heal; +} - local->govinda_gOvinda = 1; +int +afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) +{ + call_frame_t *heal_frame = NULL; + afr_local_t *heal_local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + inode_t *inode = NULL; + int event_generation = 0; + int read_subvol = -1; + int ret = 0; + + local = frame->local; + inode = local->inode; + priv = this->private; + + if (err) + goto refresh_done; + + if (local->op == GF_FOP_LOOKUP) + goto refresh_done; + + ret = afr_inode_get_readable(frame, inode, this, local->readable, + &event_generation, local->transaction.type); + + if (ret == -EIO) { + /* No readable subvolume even after refresh ==> splitbrain.*/ + if 
(!priv->fav_child_policy) { + err = EIO; + goto refresh_done; + } + read_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode, + NULL); + if (read_subvol == -1) { + err = EIO; + goto refresh_done; } - if (PERMISSION_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - gf_log (this->name, GF_LOG_NORMAL, - "permissions differ for %s ", local->loc.path); - local->self_heal.need_metadata_self_heal = _gf_true; + heal_frame = afr_frame_create(this, NULL); + if (!heal_frame) { + err = EIO; + goto refresh_done; + } + heal_local = heal_frame->local; + heal_local->xdata_req = dict_new(); + if (!heal_local->xdata_req) { + err = EIO; + AFR_STACK_DESTROY(heal_frame); + goto refresh_done; } + heal_local->heal_frame = frame; + ret = synctask_new(this->ctx->env, afr_fav_child_reset_sink_xattrs, + afr_fav_child_reset_sink_xattrs_cbk, heal_frame, + heal_frame); + return 0; + } + +refresh_done: + afr_local_replies_wipe(local, this->private); + local->refreshfn(frame, this, err); - if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - local->self_heal.need_metadata_self_heal = _gf_true; - gf_log (this->name, GF_LOG_NORMAL, - "ownership differs for %s ", local->loc.path); + return 0; +} + +int +afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error) +{ + call_frame_t *heal_frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_boolean_t start_heal = _gf_false; + afr_local_t *heal_local = NULL; + unsigned char *success_replies = NULL; + int ret = 0; + + if (error != 0) { + goto refresh_done; + } + + local = frame->local; + priv = this->private; + success_replies = alloca0(priv->child_count); + afr_fill_success_replies(local, priv, success_replies); + + if (priv->thin_arbiter_count && local->is_read_txn && + AFR_COUNT(success_replies, priv->child_count) != priv->child_count) { + /* We need to query the good bricks and/or thin-arbiter.*/ + if (success_replies[0]) { + local->read_txn_query_child = 
AFR_CHILD_ZERO; + } else if (success_replies[1]) { + local->read_txn_query_child = AFR_CHILD_ONE; + } + error = EINVAL; + goto refresh_done; + } + + if (!afr_has_quorum(success_replies, this, frame)) { + error = afr_final_errno(frame->local, this->private); + if (!error) + error = afr_quorum_errno(priv); + goto refresh_done; + } + + ret = afr_replies_interpret(frame, this, local->refreshinode, &start_heal); + + if (ret && afr_selfheal_enabled(this) && start_heal) { + heal_frame = afr_frame_create(this, NULL); + if (!heal_frame) + goto refresh_done; + heal_local = heal_frame->local; + heal_local->refreshinode = inode_ref(local->refreshinode); + heal_local->heal_frame = heal_frame; + if (!afr_throttled_selfheal(heal_frame, this)) { + AFR_STACK_DESTROY(heal_frame); + goto refresh_done; } + } - if (SIZE_DIFFERS (buf, lookup_buf) - && IA_ISREG (buf->ia_type)) { - gf_log (this->name, GF_LOG_NORMAL, - "size differs for %s ", local->loc.path); - local->self_heal.need_data_self_heal = _gf_true; +refresh_done: + afr_txn_refresh_done(frame, this, error); + + return 0; +} + +void +afr_inode_refresh_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *buf, + dict_t *xdata, struct iatt *par) +{ + afr_local_t *local = NULL; + int call_child = (long)cookie; + int8_t need_heal = 1; + int call_count = 0; + int ret = 0; + + local = frame->local; + local->replies[call_child].valid = 1; + local->replies[call_child].op_ret = op_ret; + local->replies[call_child].op_errno = op_errno; + if (op_ret != -1) { + local->replies[call_child].poststat = *buf; + if (par) + local->replies[call_child].postparent = *par; + if (xdata) + local->replies[call_child].xdata = dict_ref(xdata); + } + + if (xdata) { + ret = dict_get_int8(xdata, "link-count", &need_heal); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to get link count"); } + } + + local->replies[call_child].need_heal = need_heal; + call_count = afr_frame_return(frame); + if (call_count 
== 0) { + afr_set_need_heal(this, local); + ret = afr_inode_refresh_err(frame, this); + if (ret) { + gf_msg_debug(this->name, ret, "afr_inode_refresh_err failed"); + } + afr_inode_refresh_done(frame, this, ret); + } +} +int +afr_inode_refresh_subvol_with_lookup_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, + int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *par) +{ + afr_inode_refresh_subvol_cbk(frame, cookie, this, op_ret, op_errno, buf, + xdata, par); + return 0; } +int +afr_inode_refresh_subvol_with_lookup(call_frame_t *frame, xlator_t *this, int i, + inode_t *inode, uuid_t gfid, dict_t *xdata) +{ + loc_t loc = { + 0, + }; + afr_private_t *priv = NULL; + + priv = this->private; + + loc.inode = inode; + if (gf_uuid_is_null(inode->gfid) && gfid) { + /* To handle setattr/setxattr on yet to be linked inode from + * dht */ + gf_uuid_copy(loc.gfid, gfid); + } else { + gf_uuid_copy(loc.gfid, inode->gfid); + } + + STACK_WIND_COOKIE(frame, afr_inode_refresh_subvol_with_lookup_cbk, + (void *)(long)i, priv->children[i], + priv->children[i]->fops->lookup, &loc, xdata); + return 0; +} -static void -afr_lookup_done (call_frame_t *frame, xlator_t *this, struct iatt *lookup_buf) +int +afr_inode_refresh_subvol_with_fstat_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, + dict_t *xdata) { - int unwind = 1; - int source = -1; - int up_count = 0; - char sh_type_str[256] = {0,}; + afr_inode_refresh_subvol_cbk(frame, cookie, this, op_ret, op_errno, buf, + xdata, NULL); + return 0; +} - afr_private_t *priv = NULL; - afr_local_t *local = NULL; +int +afr_inode_refresh_subvol_with_fstat(call_frame_t *frame, xlator_t *this, int i, + dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - local->cont.lookup.postparent.ia_ino = local->cont.lookup.parent_ino; + 
STACK_WIND_COOKIE(frame, afr_inode_refresh_subvol_with_fstat_cbk, + (void *)(long)i, priv->children[i], + priv->children[i]->fops->fstat, local->fd, xdata); + return 0; +} - if (local->cont.lookup.ino) { - local->cont.lookup.buf.ia_ino = local->cont.lookup.ino; +int +afr_inode_refresh_do(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + int ret = 0; + dict_t *xdata = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + unsigned char *wind_subvols = NULL; + + priv = this->private; + local = frame->local; + wind_subvols = alloca0(priv->child_count); + + afr_local_replies_wipe(local, priv); + + if (local->fd) { + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx) { + afr_inode_refresh_done(frame, this, EINVAL); + return 0; } + } - if (local->op_ret == 0) { - /* KLUDGE: assuming DHT will not itransform in - revalidate */ - if (local->cont.lookup.inode->ino) { - local->cont.lookup.buf.ia_ino = - local->cont.lookup.inode->ino; - } - } - up_count = afr_up_children_count (priv->child_count, priv->child_up); - if (up_count == 1) { - gf_log (this->name, GF_LOG_DEBUG, - "Only 1 child up - do not attempt to detect self heal"); + xdata = dict_new(); + if (!xdata) { + afr_inode_refresh_done(frame, this, ENOMEM); + return 0; + } - goto unwind; - } + ret = afr_xattr_req_prepare(this, xdata); + if (ret != 0) { + dict_unref(xdata); + afr_inode_refresh_done(frame, this, -ret); + return 0; + } - if (local->success_count && local->enoent_count) { - local->self_heal.need_metadata_self_heal = _gf_true; - local->self_heal.need_data_self_heal = _gf_true; - local->self_heal.need_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_NORMAL, - "entries are missing in lookup of %s.", - local->loc.path); - } + ret = dict_set_sizen_str_sizen(xdata, "link-count", GF_XATTROP_INDEX_COUNT); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to set link-count in dict "); + } - if (local->success_count) { - /* check for 
split-brain case in previous lookup */ - if (afr_is_split_brain (this, - local->cont.lookup.inode)) { - local->self_heal.need_data_self_heal = _gf_true; - gf_log(this->name, GF_LOG_NORMAL, - "split brain detected during lookup of " - "%s.", local->loc.path); - } + ret = dict_set_str_sizen(xdata, GLUSTERFS_INODELK_DOM_COUNT, this->name); + if (ret) { + gf_msg_debug(this->name, -ret, + "Unable to set inodelk-dom-count in dict "); + } + + if (local->fd) { + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] && fd_ctx->opened_on[i] == AFR_FD_OPENED) + wind_subvols[i] = 1; } + } else { + memcpy(wind_subvols, local->child_up, + sizeof(*local->child_up) * priv->child_count); + } + + local->call_count = AFR_COUNT(wind_subvols, priv->child_count); + + call_count = local->call_count; + if (!call_count) { + dict_unref(xdata); + if (local->fd && AFR_COUNT(local->child_up, priv->child_count)) + afr_inode_refresh_done(frame, this, EBADFD); + else + afr_inode_refresh_done(frame, this, ENOTCONN); + return 0; + } + for (i = 0; i < priv->child_count; i++) { + if (!wind_subvols[i]) + continue; + + if (local->fd) + afr_inode_refresh_subvol_with_fstat(frame, this, i, xdata); + else + afr_inode_refresh_subvol_with_lookup( + frame, this, i, local->refreshinode, local->refreshgfid, xdata); - if ((local->self_heal.need_metadata_self_heal - || local->self_heal.need_data_self_heal - || local->self_heal.need_entry_self_heal) - && ((!local->cont.lookup.is_revalidate) - || (local->op_ret != -1))) { + if (!--call_count) + break; + } - if (local->inodelk_count || local->entrylk_count) { + dict_unref(xdata); - /* Someone else is doing self-heal on this file. 
- So just make a best effort to set the read-subvolume - and return */ + return 0; +} - if (IA_ISREG (local->cont.lookup.inode->ia_type)) { - source = afr_self_heal_get_source (this, local, local->cont.lookup.xattrs); +int +afr_inode_refresh(call_frame_t *frame, xlator_t *this, inode_t *inode, + uuid_t gfid, afr_inode_refresh_cbk_t refreshfn) +{ + afr_local_t *local = NULL; - if (source >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - source); - } - } - } else { - if (!local->cont.lookup.inode->ia_type) { - /* fix for RT #602 */ - local->cont.lookup.inode->ia_type = - lookup_buf->ia_type; - } + local = frame->local; - local->self_heal.background = _gf_true; - local->self_heal.type = local->cont.lookup.buf.ia_type; - local->self_heal.unwind = afr_self_heal_lookup_unwind; + local->refreshfn = refreshfn; - unwind = 0; + if (local->refreshinode) { + inode_unref(local->refreshinode); + local->refreshinode = NULL; + } - afr_self_heal_type_str_get(&local->self_heal, - sh_type_str, - sizeof(sh_type_str)); + local->refreshinode = inode_ref(inode); - gf_log (this->name, GF_LOG_NORMAL, "background %s " - "self-heal triggered. path: %s", - sh_type_str, local->loc.path); + if (gfid) + gf_uuid_copy(local->refreshgfid, gfid); + else + gf_uuid_clear(local->refreshgfid); - afr_self_heal (frame, this); - } - } + afr_inode_refresh_do(frame, this); -unwind: - if (unwind) { - AFR_STACK_UNWIND (lookup, frame, local->op_ret, - local->op_errno, - local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); - } + return 0; } +int +afr_xattr_req_prepare(xlator_t *this, dict_t *xattr_req) +{ + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; + + priv = this->private; -/* - * During a lookup, some errors are more "important" than - * others in that they must be given higher priority while - * returning to the user. 
- * - * The hierarchy is ESTALE > ENOENT > others - * - */ + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_uint64(xattr_req, priv->pending_key[i], + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret < 0) + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "Unable to set dict value for %s", priv->pending_key[i]); + /* 3 = data+metadata+entry */ + } + ret = dict_set_uint64(xattr_req, AFR_DIRTY, + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret) { + gf_msg_debug(this->name, -ret, + "failed to set dirty " + "query flag"); + } + + ret = dict_set_int32_sizen(xattr_req, "list-xattr", 1); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to set list-xattr in dict "); + } + + return ret; +} -static gf_boolean_t -__error_more_important (int32_t old_errno, int32_t new_errno) +int +afr_lookup_xattr_req_prepare(afr_local_t *local, xlator_t *this, + dict_t *xattr_req, loc_t *loc) { - gf_boolean_t ret = _gf_true; + int ret = -ENOMEM; + + if (!local->xattr_req) + local->xattr_req = dict_new(); + + if (!local->xattr_req) + goto out; + + if (xattr_req && (xattr_req != local->xattr_req)) + dict_copy(xattr_req, local->xattr_req); + + ret = afr_xattr_req_prepare(this, local->xattr_req); + + ret = dict_set_uint64(local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "%s: Unable to set dict value for %s", loc->path, + GLUSTERFS_INODELK_COUNT); + } + ret = dict_set_uint64(local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "%s: Unable to set dict value for %s", loc->path, + GLUSTERFS_ENTRYLK_COUNT); + } + + ret = dict_set_uint32(local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "%s: Unable to set dict value for %s", loc->path, + GLUSTERFS_PARENT_ENTRYLK); + } + + ret = dict_set_sizen_str_sizen(local->xattr_req, 
"link-count", + GF_XATTROP_INDEX_COUNT); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to set link-count in dict "); + } + + ret = 0; +out: + return ret; +} - /* Nothing should ever overwrite ESTALE */ - if (old_errno == ESTALE) - ret = _gf_false; +int +afr_least_pending_reads_child(afr_private_t *priv, unsigned char *readable) +{ + int i = 0; + int child = -1; + int64_t read_iter = -1; + int64_t pending_read = -1; + + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i]) + continue; + read_iter = GF_ATOMIC_GET(priv->pending_reads[i]); + if (child == -1 || read_iter < pending_read) { + pending_read = read_iter; + child = i; + } + } - /* Nothing should overwrite ENOENT, except ESTALE */ - else if ((old_errno == ENOENT) && (new_errno != ESTALE)) - ret = _gf_false; + return child; +} - return ret; +static int32_t +afr_least_latency_child(afr_private_t *priv, unsigned char *readable) +{ + int32_t i = 0; + int child = -1; + + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] || + priv->child_latency[i] < 0) + continue; + + if (child == -1 || + priv->child_latency[i] < priv->child_latency[child]) { + child = i; + } + } + return child; } +static int32_t +afr_least_latency_times_pending_reads_child(afr_private_t *priv, + unsigned char *readable) +{ + int32_t i = 0; + int child = -1; + int64_t pending_read = 0; + int64_t latency = -1; + int64_t least_latency = -1; + + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] || + priv->child_latency[i] < 0) + continue; + + pending_read = GF_ATOMIC_GET(priv->pending_reads[i]); + latency = (pending_read + 1) * priv->child_latency[i]; + + if (child == -1 || latency < least_latency) { + least_latency = latency; + child = i; + } + } + return child; +} int -afr_fresh_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, 
dict_t *xattr, - struct iatt *postparent) +afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv, + unsigned char *readable) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - struct iatt * lookup_buf = NULL; - - int call_count = -1; - int child_index = -1; - int first_up_child = -1; - - child_index = (long) cookie; - priv = this->private; + uuid_t gfid_copy = { + 0, + }; + pid_t pid; + int child = -1; + + switch (priv->hash_mode) { + case AFR_READ_POLICY_FIRST_UP: + break; + case AFR_READ_POLICY_GFID_HASH: + gf_uuid_copy(gfid_copy, args->gfid); + child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) % + priv->child_count; + break; + case AFR_READ_POLICY_GFID_PID_HASH: + if (args->ia_type != IA_IFDIR) { + /* + * Why getpid? Because it's one of the cheapest calls + * available - faster than gethostname etc. - and + * returns a constant-length value that's sure to be + * shorter than a UUID. It's still very unlikely to be + * the same across clients, so it still provides good + * mixing. We're not trying for perfection here. All we + * need is a low probability that multiple clients + * won't converge on the same subvolume. 
+ */ + gf_uuid_copy(gfid_copy, args->gfid); + pid = getpid(); + *(pid_t *)gfid_copy ^= pid; + } + child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) % + priv->child_count; + break; + case AFR_READ_POLICY_LESS_LOAD: + child = afr_least_pending_reads_child(priv, readable); + break; + case AFR_READ_POLICY_LEAST_LATENCY: + child = afr_least_latency_child(priv, readable); + break; + case AFR_READ_POLICY_LOAD_LATENCY_HYBRID: + child = afr_least_latency_times_pending_reads_child(priv, readable); + break; + } + + return child; +} - LOCK (&frame->lock); - { - local = frame->local; +int +afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this, + unsigned char *readable, + afr_read_subvol_args_t *args) +{ + int i = 0; + int read_subvol = -1; + afr_private_t *priv = NULL; + afr_read_subvol_args_t local_args = { + 0, + }; + + priv = this->private; + + /* first preference - explicitly specified or local subvolume */ + if (priv->read_child >= 0 && readable[priv->read_child]) + return priv->read_child; + + if (inode_is_linked(inode)) { + gf_uuid_copy(local_args.gfid, inode->gfid); + local_args.ia_type = inode->ia_type; + } else if (args) { + local_args = *args; + } + + /* second preference - use hashed mode */ + read_subvol = afr_hash_child(&local_args, priv, readable); + if (read_subvol >= 0 && readable[read_subvol]) + return read_subvol; + + for (i = 0; i < priv->child_count; i++) { + if (readable[i]) + return i; + } + + /* no readable subvolumes, either split brain or all subvols down */ + + return -1; +} - lookup_buf = &local->cont.lookup.buf; +int +afr_inode_read_subvol_type_get(inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, int type) +{ + int ret = -1; - if (op_ret == -1) { - if (op_errno == ENOENT) - local->enoent_count++; + if (type == AFR_METADATA_TRANSACTION) + ret = afr_inode_read_subvol_get(inode, this, 0, readable, event_p); + else + ret = afr_inode_read_subvol_get(inode, this, readable, 0, event_p); + return ret; +} - if 
(__error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; +void +afr_readables_intersect_get(inode_t *inode, xlator_t *this, int *event, + unsigned char *intersection) +{ + afr_private_t *priv = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + unsigned char *intersect = NULL; + + priv = this->private; + data_readable = alloca0(priv->child_count); + metadata_readable = alloca0(priv->child_count); + intersect = alloca0(priv->child_count); + + afr_inode_read_subvol_get(inode, this, data_readable, metadata_readable, + event); + + AFR_INTERSECT(intersect, data_readable, metadata_readable, + priv->child_count); + if (intersection) + memcpy(intersection, intersect, + sizeof(*intersection) * priv->child_count); +} - if (local->op_errno == ESTALE) { - local->op_ret = -1; - } +int +afr_read_subvol_get(inode_t *inode, xlator_t *this, int *subvol_p, + unsigned char *readables, int *event_p, + afr_transaction_type type, afr_read_subvol_args_t *args) +{ + afr_private_t *priv = NULL; + unsigned char *readable = NULL; + unsigned char *intersection = NULL; + int subvol = -1; + int event = 0; + + priv = this->private; + + readable = alloca0(priv->child_count); + intersection = alloca0(priv->child_count); + + afr_inode_read_subvol_type_get(inode, this, readable, &event, type); + + afr_readables_intersect_get(inode, this, &event, intersection); + + if (AFR_COUNT(intersection, priv->child_count) > 0) + subvol = afr_read_subvol_select_by_policy(inode, this, intersection, + args); + else + subvol = afr_read_subvol_select_by_policy(inode, this, readable, args); + if (subvol_p) + *subvol_p = subvol; + if (event_p) + *event_p = event; + if (readables) + memcpy(readables, readable, sizeof(*readables) * priv->child_count); + return subvol; +} - goto unlock; - } +void +afr_local_transaction_cleanup(afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = 0; - afr_lookup_collect_xattr (local, this, 
child_index, xattr); + priv = this->private; - first_up_child = afr_first_up_child (priv); + afr_matrix_cleanup(local->pending, priv->child_count); - if (child_index == first_up_child) { - local->cont.lookup.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - first_up_child); - } + GF_FREE(local->internal_lock.lower_locked_nodes); - if (local->success_count == 0) { - if (local->op_errno != ESTALE) - local->op_ret = op_ret; - - local->cont.lookup.inode = inode_ref (inode); - local->cont.lookup.xattr = dict_ref (xattr); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparent = *postparent; - - if (priv->first_lookup && inode->ino == 1) { - gf_log (this->name, GF_LOG_NORMAL, - "added root inode"); - priv->root_inode = inode_ref (inode); - priv->first_lookup = 0; - } - - *lookup_buf = *buf; - - lookup_buf->ia_ino = afr_itransform (buf->ia_ino, - priv->child_count, - child_index); - if (priv->read_child >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - priv->read_child); - } else { - afr_set_read_child (this, - local->cont.lookup.inode, - child_index); - } - - } else { - afr_lookup_self_heal_check (this, local, buf, lookup_buf); - - if (child_index == local->read_child_index) { - /* - lookup has succeeded on the read child. 
- So use its inode number - */ - if (local->cont.lookup.xattr) - dict_unref (local->cont.lookup.xattr); - - local->cont.lookup.xattr = dict_ref (xattr); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparent = *postparent; - - *lookup_buf = *buf; - } + afr_lockees_cleanup(&local->internal_lock); - } + GF_FREE(local->transaction.pre_op); - local->success_count++; + GF_FREE(local->transaction.pre_op_sources); + if (local->transaction.changelog_xdata) { + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.changelog_xdata[i]) + continue; + dict_unref(local->transaction.changelog_xdata[i]); } -unlock: - UNLOCK (&frame->lock); + GF_FREE(local->transaction.changelog_xdata); + } - call_count = afr_frame_return (frame); + GF_FREE(local->transaction.failed_subvols); - if (call_count == 0) { - afr_lookup_done (frame, this, lookup_buf); - } + GF_FREE(local->transaction.basename); + GF_FREE(local->transaction.new_basename); - return 0; + loc_wipe(&local->transaction.parent_loc); + loc_wipe(&local->transaction.new_parent_loc); } +void +afr_reply_wipe(struct afr_reply *reply) +{ + if (reply->xdata) { + dict_unref(reply->xdata); + reply->xdata = NULL; + } + + if (reply->xattr) { + dict_unref(reply->xattr); + reply->xattr = NULL; + } +} -int -afr_revalidate_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +void +afr_replies_wipe(struct afr_reply *replies, int count) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - struct iatt * lookup_buf = NULL; + int i = 0; - int call_count = -1; - int child_index = -1; - int first_up_child = -1; + for (i = 0; i < count; i++) { + afr_reply_wipe(&replies[i]); + } +} - child_index = (long) cookie; - priv = this->private; +void +afr_local_replies_wipe(afr_local_t *local, afr_private_t *priv) +{ + if (!local->replies) + return; - LOCK (&frame->lock); - { 
- local = frame->local; + afr_replies_wipe(local->replies, priv->child_count); - lookup_buf = &local->cont.lookup.buf; + memset(local->replies, 0, sizeof(*local->replies) * priv->child_count); +} - if (op_ret == -1) { - if (op_errno == ENOENT) - local->enoent_count++; +static gf_boolean_t +afr_fop_lock_is_unlock(call_frame_t *frame) +{ + afr_local_t *local = frame->local; + switch (local->op) { + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + if ((F_UNLCK == local->cont.inodelk.in_flock.l_type) && + (local->cont.inodelk.in_cmd == F_SETLKW || + local->cont.inodelk.in_cmd == F_SETLK)) + return _gf_true; + break; + case GF_FOP_ENTRYLK: + case GF_FOP_FENTRYLK: + if (ENTRYLK_UNLOCK == local->cont.entrylk.in_cmd) + return _gf_true; + break; + default: + return _gf_false; + } + return _gf_false; +} - if (__error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; +static gf_boolean_t +afr_lk_is_unlock(int32_t cmd, struct gf_flock *flock) +{ + switch (cmd) { + case F_RESLK_UNLCK: + return _gf_true; + break; - if (local->op_errno == ESTALE) { - local->op_ret = -1; - } +#if F_SETLKW != F_SETLKW64 + case F_SETLKW64: +#endif + case F_SETLKW: - goto unlock; - } +#if F_SETLK != F_SETLK64 + case F_SETLK64: +#endif + case F_SETLK: + if (F_UNLCK == flock->l_type) + return _gf_true; + break; + default: + return _gf_false; + } + return _gf_false; +} - afr_lookup_collect_xattr (local, this, child_index, xattr); +void +afr_handle_inconsistent_fop(call_frame_t *frame, int32_t *op_ret, + int32_t *op_errno) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - first_up_child = afr_first_up_child (priv); + if (!frame || !frame->this || !frame->local || !frame->this->private) + return; - if (child_index == first_up_child) { - local->cont.lookup.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - first_up_child); - } + if (*op_ret < 0) + return; - /* in case of revalidate, we need to send stat of the - * child whose stat was sent during the first 
lookup. - * (so that time stamp does not vary with revalidate. - * in case it is down, stat of the fist success will - * be replied */ + /* Failing inodelk/entrylk/lk here is not a good idea because we + * need to cleanup the locks on the other bricks if we choose to fail + * the fop here. The brick may go down just after unwind happens as well + * so anyways the fop will fail when the next fop is sent so leaving + * it like this for now.*/ + local = frame->local; + switch (local->op) { + case GF_FOP_LOOKUP: + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + case GF_FOP_ENTRYLK: + case GF_FOP_FENTRYLK: + case GF_FOP_LK: + return; + default: + break; + } - /* inode number should be preserved across revalidates */ + priv = frame->this->private; + if (!priv->consistent_io) + return; - if (local->success_count == 0) { - if (local->op_errno != ESTALE) - local->op_ret = op_ret; + if (local->event_generation && + (local->event_generation != priv->event_generation)) + goto inconsistent; - local->cont.lookup.inode = inode_ref (inode); - local->cont.lookup.xattr = dict_ref (xattr); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparent = *postparent; + return; +inconsistent: + *op_ret = -1; + *op_errno = ENOTCONN; +} - *lookup_buf = *buf; +void +afr_local_cleanup(afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = NULL; - lookup_buf->ia_ino = afr_itransform (buf->ia_ino, - priv->child_count, - child_index); + if (!local) + return; - if (priv->read_child >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - priv->read_child); - } else { - afr_set_read_child (this, - local->cont.lookup.inode, - child_index); - } + syncbarrier_destroy(&local->barrier); - } else { - afr_lookup_self_heal_check (this, local, buf, lookup_buf); + afr_local_transaction_cleanup(local, this); - if (child_index == local->read_child_index) { + priv = this->private; - /* - lookup has succeeded on the read child. 
- So use its inode number - */ + loc_wipe(&local->loc); + loc_wipe(&local->newloc); - if (local->cont.lookup.xattr) - dict_unref (local->cont.lookup.xattr); + if (local->fd) + fd_unref(local->fd); - local->cont.lookup.xattr = dict_ref (xattr); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparent = *postparent; + if (local->xattr_req) + dict_unref(local->xattr_req); - *lookup_buf = *buf; - } + if (local->xattr_rsp) + dict_unref(local->xattr_rsp); - } + if (local->dict) + dict_unref(local->dict); - local->success_count++; - } -unlock: - UNLOCK (&frame->lock); + afr_local_replies_wipe(local, priv); + GF_FREE(local->replies); - call_count = afr_frame_return (frame); + GF_FREE(local->child_up); - if (call_count == 0) { - afr_lookup_done (frame, this, lookup_buf); - } + GF_FREE(local->read_attempted); + + GF_FREE(local->readable); + GF_FREE(local->readable2); + + if (local->inode) + inode_unref(local->inode); - return 0; + if (local->parent) + inode_unref(local->parent); + + if (local->parent2) + inode_unref(local->parent2); + + if (local->refreshinode) + inode_unref(local->refreshinode); + + { /* getxattr */ + GF_FREE(local->cont.getxattr.name); + } + + { /* lk */ + GF_FREE(local->cont.lk.locked_nodes); + GF_FREE(local->cont.lk.dom_locked_nodes); + GF_FREE(local->cont.lk.dom_lock_op_ret); + GF_FREE(local->cont.lk.dom_lock_op_errno); + } + + { /* create */ + if (local->cont.create.fd) + fd_unref(local->cont.create.fd); + if (local->cont.create.params) + dict_unref(local->cont.create.params); + } + + { /* mknod */ + if (local->cont.mknod.params) + dict_unref(local->cont.mknod.params); + } + + { /* mkdir */ + if (local->cont.mkdir.params) + dict_unref(local->cont.mkdir.params); + } + + { /* symlink */ + if (local->cont.symlink.params) + dict_unref(local->cont.symlink.params); + } + + { /* writev */ + GF_FREE(local->cont.writev.vector); + if (local->cont.writev.iobref) + iobref_unref(local->cont.writev.iobref); + } + + { /* setxattr 
*/ + if (local->cont.setxattr.dict) + dict_unref(local->cont.setxattr.dict); + } + + { /* fsetxattr */ + if (local->cont.fsetxattr.dict) + dict_unref(local->cont.fsetxattr.dict); + } + + { /* removexattr */ + GF_FREE(local->cont.removexattr.name); + } + { /* xattrop */ + if (local->cont.xattrop.xattr) + dict_unref(local->cont.xattrop.xattr); + } + { /* symlink */ + GF_FREE(local->cont.symlink.linkpath); + } + + { /* opendir */ + GF_FREE(local->cont.opendir.checksum); + } + + { /* open */ + if (local->cont.open.fd) + fd_unref(local->cont.open.fd); + } + + { /* readdirp */ + if (local->cont.readdir.dict) + dict_unref(local->cont.readdir.dict); + } + + { /* inodelk */ + GF_FREE(local->cont.inodelk.volume); + if (local->cont.inodelk.xdata) + dict_unref(local->cont.inodelk.xdata); + } + + { /* entrylk */ + GF_FREE(local->cont.entrylk.volume); + GF_FREE(local->cont.entrylk.basename); + if (local->cont.entrylk.xdata) + dict_unref(local->cont.entrylk.xdata); + } + + if (local->xdata_req) + dict_unref(local->xdata_req); + + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); } - int -afr_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) +afr_frame_return(call_frame_t *frame) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int ret = -1; - int i = 0; + afr_local_t *local = NULL; + int call_count = 0; - fop_lookup_cbk_t callback; + local = frame->local; - int call_count = 0; + LOCK(&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK(&frame->lock); - uint64_t ctx; + return call_count; +} - int32_t op_errno = 0; +static char *afr_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL}; - priv = this->private; +gf_boolean_t +afr_is_xattr_ignorable(char *key) +{ + int i = 0; + + if (!strncmp(key, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) + return _gf_true; + for (i = 0; afr_ignore_xattrs[i]; i++) { + if (!strcmp(key, afr_ignore_xattrs[i])) + return _gf_true; + } + return _gf_false; +} - ALLOC_OR_GOTO (local, 
afr_local_t, out); +static gf_boolean_t +afr_xattr_match_needed(dict_t *this, char *key1, data_t *value1, void *data) +{ + /* Ignore all non-disk (i.e. virtual) xattrs right away. */ + if (!gf_is_valid_xattr_namespace(key1)) + return _gf_false; - local->op_ret = -1; + /* Ignore on-disk xattrs that AFR doesn't need to heal. */ + if (!afr_is_xattr_ignorable(key1)) + return _gf_true; - frame->local = local; + return _gf_false; +} - if (!strcmp (loc->path, "/" GF_REPLICATE_TRASH_DIR)) { - op_errno = ENOENT; - goto out; - } +gf_boolean_t +afr_xattrs_are_equal(dict_t *dict1, dict_t *dict2) +{ + return are_dicts_equal(dict1, dict2, afr_xattr_match_needed, NULL); +} + +static int +afr_get_parent_read_subvol(xlator_t *this, inode_t *parent, + struct afr_reply *replies, unsigned char *readable) +{ + int i = 0; + int par_read_subvol = -1; + int par_read_subvol_iter = -1; + afr_private_t *priv = NULL; - loc_copy (&local->loc, loc); + priv = this->private; - ret = inode_ctx_get (loc->inode, this, &ctx); - if (ret == 0) { - /* lookup is a revalidate */ + if (parent) + par_read_subvol = afr_data_subvol_get(parent, this, NULL, NULL, NULL, + NULL); - callback = afr_revalidate_lookup_cbk; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; - local->cont.lookup.is_revalidate = _gf_true; - local->read_child_index = afr_read_child (this, - loc->inode); - } else { - callback = afr_fresh_lookup_cbk; + if (replies[i].op_ret < 0) + continue; - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + if (par_read_subvol_iter == -1) { + par_read_subvol_iter = i; + continue; } - if (loc->parent) - local->cont.lookup.parent_ino = loc->parent->ino; + if ((par_read_subvol_iter != par_read_subvol) && readable[i]) + par_read_subvol_iter = i; + + if (i == par_read_subvol) + par_read_subvol_iter = i; + } + /* At the end of the for-loop, the only reason why 
@par_read_subvol_iter + * could be -1 is when this LOOKUP has failed on all sub-volumes. + * So it is okay to send an arbitrary subvolume (0 in this case) + * as parent read subvol. + */ + if (par_read_subvol_iter == -1) + par_read_subvol_iter = 0; + + return par_read_subvol_iter; +} - local->child_up = memdup (priv->child_up, priv->child_count); +int +afr_read_subvol_decide(inode_t *inode, xlator_t *this, + afr_read_subvol_args_t *args, unsigned char *readable) +{ + int event = 0; + afr_private_t *priv = NULL; + unsigned char *intersection = NULL; - local->cont.lookup.xattrs = GF_CALLOC (priv->child_count, - sizeof (*local->cont.lookup.xattr), - gf_afr_mt_dict_t); + priv = this->private; + intersection = alloca0(priv->child_count); - local->call_count = afr_up_children_count (priv->child_count, - local->child_up); - call_count = local->call_count; + afr_readables_intersect_get(inode, this, &event, intersection); - if (local->call_count == 0) { - ret = -1; - op_errno = ENOTCONN; - goto out; - } + if (AFR_COUNT(intersection, priv->child_count) <= 0) { + /* TODO: If we have one brick with valid data_readable and + * another with metadata_readable, try to send an iatt with + * valid bits from both.*/ + return -1; + } - /* By default assume ENOTCONN. On success it will be set to 0. 
*/ - local->op_errno = ENOTCONN; + memcpy(readable, intersection, sizeof(*readable) * priv->child_count); - if (xattr_req == NULL) - local->xattr_req = dict_new (); - else - local->xattr_req = dict_ref (xattr_req); + return afr_read_subvol_select_by_policy(inode, this, intersection, args); +} - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (local->xattr_req, priv->pending_key[i], - 3 * sizeof(int32_t)); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - /* 3 = data+metadata+entry */ - } +static inline int +afr_first_up_child(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - } + local = frame->local; + priv = this->private; - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - } + for (i = 0; i < priv->child_count; i++) + if (local->replies[i].valid && local->replies[i].op_ret == 0) + return i; + return -1; +} - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, callback, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - loc, local->xattr_req); - if (!--call_count) - break; - } - } +static void +afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this, + unsigned char *success_replies, + unsigned char *data_readable, int *read_subvol) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int spb_subvol = -1; + int child_count = -1; - ret = 0; -out: - if (ret == -1) - AFR_STACK_UNWIND (lookup, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + if (*read_subvol != -1) + return; - return 0; + priv = this->private; + local = frame->local; + child_count = priv->child_count; + + 
afr_split_brain_read_subvol_get(local->inode, this, frame, &spb_subvol); + if ((spb_subvol >= 0) && + (AFR_COUNT(success_replies, child_count) == child_count)) { + *read_subvol = spb_subvol; + } else if (!priv->quorum_count || + frame->root->pid == GF_CLIENT_PID_GLFS_HEAL) { + *read_subvol = afr_first_up_child(frame, this); + } else if (priv->quorum_count && + afr_has_quorum(data_readable, this, NULL)) { + /* read_subvol is guaranteed to be valid if we hit this path. */ + *read_subvol = afr_first_up_child(frame, this); + } else { + /* If quorum is enabled and we do not have a + readable yet, it means all good copies are down. + */ + local->op_ret = -1; + local->op_errno = ENOTCONN; + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_READ_SUBVOL_ERROR, + "no read " + "subvols for %s", + local->loc.path); + } + if (*read_subvol >= 0) + dict_del_sizen(local->replies[*read_subvol].xdata, GF_CONTENT_KEY); } - -/* {{{ open */ - -int -afr_fd_ctx_set (xlator_t *this, fd_t *fd) +static void +afr_lookup_done(call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = -1; + int op_errno = 0; + int read_subvol = 0; + int par_read_subvol = 0; + int ret = -1; + unsigned char *readable = NULL; + unsigned char *success_replies = NULL; + int event = 0; + struct afr_reply *replies = NULL; + uuid_t read_gfid = { + 0, + }; + gf_boolean_t locked_entry = _gf_false; + gf_boolean_t in_flight_create = _gf_false; + gf_boolean_t can_interpret = _gf_true; + inode_t *parent = NULL; + ia_type_t ia_type = IA_INVAL; + afr_read_subvol_args_t args = { + 0, + }; + char *gfid_heal_msg = NULL; + + priv = this->private; + local = frame->local; + replies = local->replies; + parent = local->loc.parent; + + locked_entry = afr_is_possibly_under_txn(AFR_ENTRY_TRANSACTION, local, + this); + + readable = alloca0(priv->child_count); + success_replies = alloca0(priv->child_count); + + afr_inode_read_subvol_get(parent, this, readable, 
NULL, &event); + par_read_subvol = afr_get_parent_read_subvol(this, parent, replies, + readable); + + /* First, check if we have a gfid-change from somewhere, + If so, propagate that so that a fresh lookup can be + issued + */ + if (local->cont.lookup.needs_fresh_lookup) { + local->op_ret = -1; + local->op_errno = ESTALE; + goto error; + } - int ret = -1; + op_errno = afr_final_errno(frame->local, this->private); + local->op_errno = op_errno; - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; + read_subvol = -1; + afr_fill_success_replies(local, priv, success_replies); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (fd, out); + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; - priv = this->private; + if (replies[i].op_ret == -1) { + if (locked_entry && replies[i].op_errno == ENOENT) { + in_flight_create = _gf_true; + } + continue; + } - LOCK (&fd->lock); - { - ret = __fd_ctx_get (fd, this, &ctx); + if (read_subvol == -1 || !readable[read_subvol]) { + read_subvol = i; + gf_uuid_copy(read_gfid, replies[i].poststat.ia_gfid); + ia_type = replies[i].poststat.ia_type; + local->op_ret = 0; + } + } - if (ret == 0) - goto unlock; + if (in_flight_create && !afr_has_quorum(success_replies, this, NULL)) { + local->op_ret = -1; + local->op_errno = ENOENT; + goto error; + } + + if (read_subvol == -1) + goto error; + /* We now have a read_subvol, which is readable[] (if there + were any). Next we look for GFID mismatches. We don't + consider a GFID mismatch as an error if read_subvol is + readable[] but the mismatching GFID subvol is not. 
+ */ + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) { + continue; + } - fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), - gf_afr_mt_afr_fd_ctx_t); - if (!fd_ctx) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); + if (!gf_uuid_compare(replies[i].poststat.ia_gfid, read_gfid)) + continue; - ret = -ENOMEM; - goto unlock; - } + can_interpret = _gf_false; - fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_done) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - ret = -ENOMEM; - goto unlock; - } + if (locked_entry) + continue; - fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_piggyback) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - ret = -ENOMEM; - goto unlock; - } + /* Now GFIDs mismatch. It's OK as long as this subvol + is not readable[] but read_subvol is */ + if (readable[read_subvol] && !readable[i]) + continue; - fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->opened_on) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - ret = -ENOMEM; - goto unlock; - } + /* If we were called from glfsheal and there is still a gfid + * mismatch, succeed the lookup and let glfsheal print the + * response via gfid-heal-msg.*/ + if (!dict_get_str_sizen(local->xattr_req, "gfid-heal-msg", + &gfid_heal_msg)) + goto cant_interpret; - fd_ctx->up_count = priv->up_count; - fd_ctx->down_count = priv->down_count; - - fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->locked_on) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - ret = -ENOMEM; - goto unlock; - } + /* LOG ERROR */ + local->op_ret = -1; + local->op_errno = EIO; + goto error; + } + + /* Forth, for the finalized GFID, pick the best subvolume + to 
return stats from. + */ + read_subvol = -1; + memset(readable, 0, sizeof(*readable) * priv->child_count); + if (can_interpret) { + if (!afr_has_quorum(success_replies, this, NULL)) + goto cant_interpret; + /* It is safe to call afr_replies_interpret() because we have + a response from all the UP subvolumes and all of them resolved + to the same GFID + */ + gf_uuid_copy(args.gfid, read_gfid); + args.ia_type = ia_type; + ret = afr_replies_interpret(frame, this, local->inode, NULL); + read_subvol = afr_read_subvol_decide(local->inode, this, &args, + readable); + if (read_subvol == -1) + goto cant_interpret; + if (ret) { + afr_inode_need_refresh_set(local->inode, this); + dict_del_sizen(local->replies[read_subvol].xdata, GF_CONTENT_KEY); + } + } else { + cant_interpret: + afr_attempt_readsubvol_set(frame, this, success_replies, readable, + &read_subvol); + if (read_subvol == -1) { + goto error; + } + } - ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); + afr_handle_quota_size(frame, this); - INIT_LIST_HEAD (&fd_ctx->entries); + afr_set_need_heal(this, local); + if (AFR_IS_ARBITER_BRICK(priv, read_subvol) && local->op_ret == 0) { + local->op_ret = -1; + local->op_errno = ENOTCONN; + gf_msg_debug(this->name, 0, + "Arbiter cannot be a read subvol " + "for %s", + local->loc.path); + goto error; + } + + ret = dict_get_str_sizen(local->xattr_req, "gfid-heal-msg", &gfid_heal_msg); + if (!ret) { + ret = dict_set_str_sizen(local->replies[read_subvol].xdata, + "gfid-heal-msg", gfid_heal_msg); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED, + "Error setting gfid-heal-msg dict"); + local->op_ret = -1; + local->op_errno = ENOMEM; } -unlock: - UNLOCK (&fd->lock); -out: - return ret; + } + + AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[par_read_subvol].postparent); + return; + +error: + AFR_STACK_UNWIND(lookup, frame, 
local->op_ret, local->op_errno, NULL, NULL, + NULL, NULL); } -/* {{{ flush */ +/* + * During a lookup, some errors are more "important" than + * others in that they must be given higher priority while + * returning to the user. + * + * The hierarchy is ENODATA > ENOENT > ESTALE > ENOSPC others + */ int -afr_flush_unwind (call_frame_t *frame, xlator_t *this) +afr_higher_errno(int32_t old_errno, int32_t new_errno) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + if (old_errno == ENODATA || new_errno == ENODATA) + return ENODATA; + if (old_errno == ENOENT || new_errno == ENOENT) + return ENOENT; + if (old_errno == ESTALE || new_errno == ESTALE) + return ESTALE; + if (old_errno == ENOSPC || new_errno == ENOSPC) + return ENOSPC; + + return new_errno; +} - local = frame->local; +int +afr_final_errno(afr_local_t *local, afr_private_t *priv) +{ + int i = 0; + int op_errno = 0; + int tmp_errno = 0; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret >= 0) + continue; + tmp_errno = local->replies[i].op_errno; + op_errno = afr_higher_errno(op_errno, tmp_errno); + } + + return op_errno; +} - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); +static int32_t +afr_local_discovery_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + int ret = 0; + char *pathinfo = NULL; + gf_boolean_t is_local = _gf_false; + afr_private_t *priv = NULL; + int32_t child_index = -1; + + if (op_ret != 0) { + goto out; + } + + priv = this->private; + child_index = (int32_t)(long)cookie; + + ret = dict_get_str_sizen(dict, GF_XATTR_PATHINFO_KEY, &pathinfo); + if (ret != 0) { + goto out; + } + + ret = glusterfs_is_local_pathinfo(pathinfo, &is_local); + if (ret) { + goto out; + } + + /* + * Note that one local subvolume 
will override another here. The only + * way to avoid that would be to retain extra information about whether + * the previous read_child is local, and it's just not worth it. Even + * the slowest local subvolume is far preferable to a remote one. + */ + if (is_local) { + priv->local[child_index] = 1; + /* Don't set arbiter as read child. */ + if (AFR_IS_ARBITER_BRICK(priv, child_index)) + goto out; + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_LOCAL_CHILD, + "selecting local read_child %s", + priv->children[child_index]->name); + + priv->read_child = child_index; + } +out: + STACK_DESTROY(frame->root); + return 0; +} - if (main_frame) { - AFR_STACK_UNWIND (flush, main_frame, - local->op_ret, local->op_errno); - } +static void +afr_attempt_local_discovery(xlator_t *this, int32_t child_index) +{ + call_frame_t *newframe = NULL; + loc_t tmploc = { + 0, + }; + afr_private_t *priv = this->private; + + newframe = create_frame(this, this->ctx->pool); + if (!newframe) { + return; + } - return 0; + tmploc.gfid[sizeof(tmploc.gfid) - 1] = 1; + STACK_WIND_COOKIE(newframe, afr_local_discovery_cbk, + (void *)(long)child_index, priv->children[child_index], + priv->children[child_index]->fops->getxattr, &tmploc, + GF_XATTR_PATHINFO_KEY, NULL); } - int -afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_lookup_sh_metadata_wrap(void *opaque) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + inode_t *inode = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int i = 0, first = -1; + int ret = -1; + dict_t *dict = NULL; + + local = frame->local; + this = frame->this; + priv = this->private; + replies = local->replies; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + first = i; + break; + } + if (first == -1) + goto out; + + if 
(afr_selfheal_metadata_by_stbuf(this, &replies[first].poststat)) + goto out; + + afr_local_replies_wipe(local, this->private); + + dict = dict_new(); + if (!dict) + goto out; + if (local->xattr_req) { + dict_copy(local->xattr_req, dict); + } + + ret = dict_set_sizen_str_sizen(dict, "link-count", GF_XATTROP_INDEX_COUNT); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to set link-count in dict "); + } + + if (loc_is_nameless(&local->loc)) { + ret = afr_selfheal_unlocked_discover_on(frame, local->inode, + local->loc.gfid, local->replies, + local->child_up, dict); + } else { + inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent, + local->loc.name, local->replies, + local->child_up, dict); + } + if (inode) + inode_unref(inode); +out: + if (loc_is_nameless(&local->loc)) + afr_discover_done(frame, this); + else + afr_lookup_done(frame, this); - int call_count = -1; - int child_index = (long) cookie; - int need_unwind = 0; + if (dict) + dict_unref(dict); - local = frame->local; - priv = this->private; + return 0; +} - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } +gf_boolean_t +afr_is_pending_set(xlator_t *this, dict_t *xdata, int type) +{ + int idx = -1; + afr_private_t *priv = NULL; + void *pending_raw = NULL; + int *pending_int = NULL; + int i = 0; + + priv = this->private; + idx = afr_index_for_transaction_type(type); + + if (dict_get_ptr(xdata, AFR_DIRTY, &pending_raw) == 0) { + if (pending_raw) { + pending_int = pending_raw; - local->op_errno = op_errno; + if (ntoh32(pending_int[idx])) + return _gf_true; } - UNLOCK (&frame->lock); + } + + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw)) + continue; + if (!pending_raw) + 
continue; + pending_int = pending_raw; + + if (ntoh32(pending_int[idx])) + return _gf_true; + } - if (need_unwind) - afr_flush_unwind (frame, this); + return _gf_false; +} - call_count = afr_frame_return (frame); +static gf_boolean_t +afr_can_start_metadata_self_heal(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int i = 0, first = -1; + gf_boolean_t start = _gf_false; + struct iatt stbuf = { + 0, + }; + + local = frame->local; + replies = local->replies; + priv = this->private; + + if (!priv->metadata_self_heal) + return _gf_false; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (first == -1) { + first = i; + stbuf = replies[i].poststat; + continue; + } - if (call_count == 0) { - local->transaction.resume (frame, this); + if (afr_is_pending_set(this, replies[i].xdata, + AFR_METADATA_TRANSACTION)) { + /* Let shd do the heal so that lookup is not blocked + * on getting metadata lock/doing the heal */ + start = _gf_false; + break; } - return 0; + if (gf_uuid_compare(stbuf.ia_gfid, replies[i].poststat.ia_gfid)) { + start = _gf_false; + break; + } + if (!IA_EQUAL(stbuf, replies[i].poststat, type)) { + start = _gf_false; + break; + } + + /*Check if iattrs need heal*/ + if ((!IA_EQUAL(stbuf, replies[i].poststat, uid)) || + (!IA_EQUAL(stbuf, replies[i].poststat, gid)) || + (!IA_EQUAL(stbuf, replies[i].poststat, prot))) { + start = _gf_true; + continue; + } + + /*Check if xattrs need heal*/ + if (!afr_xattrs_are_equal(replies[first].xdata, replies[i].xdata)) + start = _gf_true; + } + + return start; } +int +afr_lookup_metadata_heal_check(call_frame_t *frame, xlator_t *this) + +{ + call_frame_t *heal = NULL; + afr_local_t *local = NULL; + int ret = 0; + + local = frame->local; + if (!afr_can_start_metadata_self_heal(frame, this)) + goto out; + + heal = afr_frame_create(this, &ret); + if (!heal) { + ret = -ret; + goto 
out; + } + + ret = synctask_new(this->ctx->env, afr_lookup_sh_metadata_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto out; + return ret; +out: + if (loc_is_nameless(&local->loc)) + afr_discover_done(frame, this); + else + afr_lookup_done(frame, this); + if (heal) + AFR_STACK_DESTROY(heal); + return ret; +} int -afr_flush_wind (call_frame_t *frame, xlator_t *this) +afr_lookup_selfheal_wrap(void *opaque) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + int ret = 0; + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + inode_t *inode = NULL; + uuid_t pargfid = { + 0, + }; + + local = frame->local; + this = frame->this; + loc_pargfid(&local->loc, pargfid); + + ret = afr_selfheal_name(frame->this, pargfid, local->loc.name, + &local->cont.lookup.gfid_req, local->xattr_req); + if (ret == -EIO) + goto unwind; + + afr_local_replies_wipe(local, this->private); + + inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent, + local->loc.name, local->replies, + local->child_up, local->xattr_req); + if (inode) + inode_unref(inode); + + afr_lookup_metadata_heal_check(frame, this); + return 0; - int i = 0; - int call_count = -1; +unwind: + AFR_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL); + return 0; +} - local = frame->local; - priv = this->private; +int +afr_lookup_entry_heal(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + call_frame_t *heal = NULL; + int i = 0, first = -1; + gf_boolean_t name_state_mismatch = _gf_false; + struct afr_reply *replies = NULL; + int ret = 0; + unsigned char *par_readables = NULL; + unsigned char *success = NULL; + int32_t op_errno = 0; + uuid_t gfid = {0}; + + local = frame->local; + replies = local->replies; + priv = this->private; + par_readables = alloca0(priv->child_count); + success = alloca0(priv->child_count); + + ret = afr_inode_read_subvol_get(local->loc.parent, this, par_readables, + NULL, NULL); + 
if (ret < 0 || AFR_COUNT(par_readables, priv->child_count) == 0) { + /* In this case set par_readables to all 1 so that name_heal + * need checks at the end of this function will flag missing + * entry when name state mismatches*/ + memset(par_readables, 1, priv->child_count); + } + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret == 0) { + if (gf_uuid_is_null(gfid)) { + gf_uuid_copy(gfid, replies[i].poststat.ia_gfid); + } + success[i] = 1; + } else { + if ((replies[i].op_errno != ENOTCONN) && + (replies[i].op_errno != ENOENT) && + (replies[i].op_errno != ESTALE)) { + op_errno = replies[i].op_errno; + } + } - call_count = afr_up_children_count (priv->child_count, local->child_up); + /*gfid is missing, needs heal*/ + if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA)) { + goto name_heal; + } - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; + if (first == -1) { + first = i; + continue; } - local->call_count = call_count; + if (replies[i].op_ret != replies[first].op_ret) { + name_state_mismatch = _gf_true; + } + if (replies[i].op_ret == 0) { + /* Rename after this lookup may succeed if we don't do + * a name-heal and the destination may not have pending xattrs + * to indicate which name is good and which is bad so always do + * this heal*/ + if (gf_uuid_compare(replies[i].poststat.ia_gfid, gfid)) { + goto name_heal; + } + } + } + + if (name_state_mismatch) { + if (!priv->quorum_count) + goto name_heal; + if (!afr_has_quorum(success, this, NULL)) + goto name_heal; + if (op_errno) + goto name_heal; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_flush_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - local->fd); - - if (!--call_count) - break; - } + if (!replies[i].valid) + continue; + if (par_readables[i] && replies[i].op_ret < 0 && + replies[i].op_errno != ENOTCONN) { + goto 
name_heal; + } } + } - return 0; -} - + goto metadata_heal; -int -afr_flush_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; +name_heal: + heal = afr_frame_create(this, NULL); + if (!heal) + goto metadata_heal; - local->transaction.unwind (frame, this); + ret = synctask_new(this->ctx->env, afr_lookup_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) { + AFR_STACK_DESTROY(heal); + goto metadata_heal; + } + return ret; - AFR_STACK_DESTROY (frame); +metadata_heal: + ret = afr_lookup_metadata_heal_check(frame, this); - return 0; + return ret; } - int -afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +afr_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; + afr_local_t *local = NULL; + int call_count = -1; + int child_index = -1; + GF_UNUSED int ret = 0; + int8_t need_heal = 1; + + child_index = (long)cookie; + + local = frame->local; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + /* + * On revalidate lookup if the gfid-changed, afr should unwind the fop + * with ESTALE so that a fresh lookup will be sent by the top xlator. + * So remember it. 
+ */ + if (xdata && dict_get_sizen(xdata, "gfid-changed")) + local->cont.lookup.needs_fresh_lookup = _gf_true; + + if (xdata) { + ret = dict_get_int8(xdata, "link-count", &need_heal); + local->replies[child_index].need_heal = need_heal; + } else { + local->replies[child_index].need_heal = need_heal; + } + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref(xdata); + } + + call_count = afr_frame_return(frame); + if (call_count == 0) { + afr_set_need_heal(this, local); + afr_lookup_entry_heal(frame, this); + } + + return 0; +} - call_frame_t * transaction_frame = NULL; +static void +afr_discover_unwind(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int read_subvol = -1; + int ret = 0; + unsigned char *data_readable = NULL; + unsigned char *success_replies = NULL; + + priv = this->private; + local = frame->local; + data_readable = alloca0(priv->child_count); + success_replies = alloca0(priv->child_count); + + afr_fill_success_replies(local, priv, success_replies); + if (AFR_COUNT(success_replies, priv->child_count) > 0) + local->op_ret = 0; + + if (local->op_ret < 0) { + local->op_ret = -1; + local->op_errno = afr_final_errno(frame->local, this->private); + goto error; + } - int ret = -1; + if (!afr_has_quorum(success_replies, this, frame)) + goto unwind; - int op_ret = -1; - int op_errno = 0; + ret = afr_replies_interpret(frame, this, local->inode, NULL); + if (ret) { + afr_inode_need_refresh_set(local->inode, this); + } - int call_count = 0; + read_subvol = afr_read_subvol_decide(local->inode, this, NULL, + data_readable); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); +unwind: + afr_attempt_readsubvol_set(frame, this, success_replies, data_readable, + &read_subvol); + if (read_subvol == -1) + goto error; - priv = 
this->private; + if (AFR_IS_ARBITER_BRICK(priv, read_subvol) && local->op_ret == 0) { + local->op_ret = -1; + local->op_errno = ENOTCONN; + gf_msg_debug(this->name, 0, + "Arbiter cannot be a read subvol " + "for %s", + local->loc.path); + } + + AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); + return; + +error: + AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, NULL, NULL, + NULL, NULL); +} - ALLOC_OR_GOTO (local, afr_local_t, out); +static int +afr_ta_id_file_check(void *opaque) +{ + afr_private_t *priv = NULL; + xlator_t *this = NULL; + loc_t loc = { + 0, + }; + struct iatt stbuf = { + 0, + }; + dict_t *dict = NULL; + uuid_t gfid = { + 0, + }; + fd_t *fd = NULL; + int ret = 0; + + this = opaque; + priv = this->private; + + ret = afr_fill_ta_loc(this, &loc, _gf_false); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate thin-arbiter loc for: %s.", loc.name); + goto out; + } + + ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, &stbuf, + 0, 0, 0); + if (ret == 0) { + goto out; + } else if (ret == -ENOENT) { + fd = fd_create(loc.inode, getpid()); + if (!fd) + goto out; + dict = dict_new(); + if (!dict) + goto out; + gf_uuid_generate(gfid); + ret = dict_set_gfuuid(dict, "gfid-req", gfid, true); + ret = syncop_create(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, + O_RDWR, 0664, fd, &stbuf, dict, NULL); + } - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; +out: + if (ret == 0) { + gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid); + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to lookup/create thin-arbiter id file."); + } + if (dict) + dict_unref(dict); + if (fd) + fd_unref(fd); + loc_wipe(&loc); + + return 0; +} + +static int +afr_ta_id_file_check_cbk(int ret, call_frame_t 
*ta_frame, void *opaque) +{ + return 0; +} + +static void +afr_discover_done(call_frame_t *frame, xlator_t *this) +{ + int ret = 0; + afr_private_t *priv = NULL; + + priv = this->private; + if (!priv->thin_arbiter_count) + goto unwind; + if (!gf_uuid_is_null(priv->ta_gfid)) + goto unwind; + + ret = synctask_new(this->ctx->env, afr_ta_id_file_check, + afr_ta_id_file_check_cbk, NULL, this); + if (ret) + goto unwind; +unwind: + afr_discover_unwind(frame, this); +} + +int +afr_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + afr_local_t *local = NULL; + int call_count = -1; + int child_index = -1; + GF_UNUSED int ret = 0; + int8_t need_heal = 1; + + child_index = (long)cookie; + + local = frame->local; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref(xdata); + } + + if (local->do_discovery && (op_ret == 0)) + afr_attempt_local_discovery(this, child_index); + + if (xdata) { + ret = dict_get_int8(xdata, "link-count", &need_heal); + local->replies[child_index].need_heal = need_heal; + } else { + local->replies[child_index].need_heal = need_heal; + } + + call_count = afr_frame_return(frame); + if (call_count == 0) { + afr_set_need_heal(this, local); + afr_lookup_metadata_heal_check(frame, this); + } + + return 0; +} + +int +afr_discover_do(call_frame_t *frame, xlator_t *this, int err) +{ + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + priv = this->private; + + if (err) { + local->op_errno = err; + goto out; + } + + call_count = local->call_count = AFR_COUNT(local->child_up, + 
priv->child_count); + + ret = afr_lookup_xattr_req_prepare(local, this, local->xattr_req, + &local->loc); + if (ret) { + local->op_errno = -ret; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE( + frame, afr_discover_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->lookup, &local->loc, local->xattr_req); + if (!--call_count) + break; } + } - call_count = afr_up_children_count (priv->child_count, local->child_up); + return 0; +out: + AFR_STACK_UNWIND(lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; +} - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; +int +afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int event = 0; + + priv = this->private; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; + } + + if (__is_root_gfid(loc->inode->gfid)) { + if (!priv->root_inode) + priv->root_inode = inode_ref(loc->inode); + + if (priv->choose_local && !priv->did_discovery) { + /* Logic to detect which subvolumes of AFR are + local, in order to prefer them for reads + */ + local->do_discovery = _gf_true; + priv->did_discovery = _gf_true; } + } - transaction_frame->local = local; + local->op = GF_FOP_LOOKUP; - local->op = GF_FOP_FLUSH; + loc_copy(&local->loc, loc); - local->transaction.fop = afr_flush_wind; - local->transaction.done = afr_flush_done; - local->transaction.unwind = afr_flush_unwind; + local->inode = inode_ref(loc->inode); - local->fd = fd_ref (fd); + if (xattr_req) { + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_copy_with_ref(xattr_req, NULL); + if (!local->xattr_req) { + op_errno = ENOMEM; + goto 
out; + } + } - local->transaction.main_frame = frame; - local->transaction.start = 0; - local->transaction.len = 0; + if (gf_uuid_is_null(loc->inode->gfid)) { + afr_discover_do(frame, this, 0); + return 0; + } - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + afr_read_subvol_get(loc->inode, this, NULL, NULL, &event, + AFR_DATA_TRANSACTION, NULL); + afr_discover_do(frame, this, 0); - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; +} - AFR_STACK_UNWIND (flush, frame, op_ret, op_errno); +int +afr_lookup_do(call_frame_t *frame, xlator_t *this, int err) +{ + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + priv = this->private; + + if (err < 0) { + local->op_errno = err; + goto out; + } + + call_count = local->call_count = AFR_COUNT(local->child_up, + priv->child_count); + + ret = afr_lookup_xattr_req_prepare(local, this, local->xattr_req, + &local->loc); + if (ret) { + local->op_errno = -ret; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE( + frame, afr_lookup_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->lookup, &local->loc, local->xattr_req); + if (!--call_count) + break; } - - return 0; + } + return 0; +out: + AFR_STACK_UNWIND(lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; } -/* }}} */ - +/* + * afr_lookup() + * + * The goal here is to figure out what the element getting looked up is. + * i.e what is the GFID, inode type and a conservative estimate of the + * inode attributes are. + * + * As we lookup, operations may be underway on the entry name and the + * inode. In lookup() we are primarily concerned only with the entry + * operations. 
If the entry is getting unlinked or renamed, we detect + * what operation is underway by querying for on-going transactions and + * pending self-healing on the entry through xdata. + * + * If the entry is a file/dir, it may need self-heal and/or in a + * split-brain condition. Lookup is not the place to worry about these + * conditions. Outcast marking will naturally handle them in the read + * paths. + * + * Here is a brief goal of what we are trying to achieve: + * + * - LOOKUP on all subvolumes concurrently, querying on-going transaction + * and pending self-heal info from the servers. + * + * - If all servers reply the same inode type and GFID, the overall call + * MUST be a success. + * + * - If inode types or GFIDs mismatch, and there IS either an on-going + * transaction or pending self-heal, inspect what the nature of the + * transaction or pending heal is, and select the appropriate subvolume's + * reply as the winner. + * + * - If inode types or GFIDs mismatch, and there are no on-going transactions + * or pending self-heal on the entry name on any of the servers, fail the + * lookup with EIO. Something has gone wrong beyond reasonable action. 
+ */ int -afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) +afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = 0; + afr_local_t *local = NULL; + int32_t op_errno = 0; + int event = 0; + int ret = 0; + + if (loc_is_nameless(loc)) { + if (xattr_req) + dict_del_sizen(xattr_req, "gfid-req"); + afr_discover(frame, this, loc, xattr_req); + return 0; + } - ret = fd_ctx_get (fd, this, &ctx); + if (afr_is_private_directory(this->private, loc->parent->gfid, loc->name, + frame->root->pid)) { + op_errno = EPERM; + goto out; + } - if (ret < 0) - goto out; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - if (fd_ctx) { - if (fd_ctx->pre_op_done) - GF_FREE (fd_ctx->pre_op_done); + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; + } - if (fd_ctx->opened_on) - GF_FREE (fd_ctx->opened_on); + local->op = GF_FOP_LOOKUP; - if (fd_ctx->locked_on) - GF_FREE (fd_ctx->locked_on); + loc_copy(&local->loc, loc); - if (fd_ctx->pre_op_piggyback) - GF_FREE (fd_ctx->pre_op_piggyback); + local->inode = inode_ref(loc->inode); - GF_FREE (fd_ctx); + if (xattr_req) { + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_copy_with_ref(xattr_req, NULL); + if (!local->xattr_req) { + op_errno = ENOMEM; + goto out; } + ret = dict_get_gfuuid(local->xattr_req, "gfid-req", + &local->cont.lookup.gfid_req); + if (ret == 0) { + dict_del_sizen(local->xattr_req, "gfid-req"); + } + } + + afr_read_subvol_get(loc->parent, this, NULL, NULL, &event, + AFR_DATA_TRANSACTION, NULL); + + afr_lookup_do(frame, this, 0); + return 0; out: - return 0; + AFR_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + + return 0; } +void +_afr_cleanup_fd_ctx(xlator_t *this, afr_fd_ctx_t *fd_ctx) +{ + afr_private_t *priv = this->private; + + if (fd_ctx->lk_heal_info) { + LOCK(&priv->lock); + { + 
list_del(&fd_ctx->lk_heal_info->pos); + } + afr_lk_heal_info_cleanup(fd_ctx->lk_heal_info); + fd_ctx->lk_heal_info = NULL; + } + GF_FREE(fd_ctx->opened_on); + GF_FREE(fd_ctx); + return; +} int -afr_release (xlator_t *this, fd_t *fd) +afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd) { - afr_locked_fd_t *locked_fd = NULL; - afr_locked_fd_t *tmp = NULL; - afr_private_t *priv = NULL; + uint64_t ctx = 0; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = 0; - priv = this->private; + ret = fd_ctx_get(fd, this, &ctx); + if (ret < 0) + goto out; - afr_cleanup_fd_ctx (this, fd); + fd_ctx = (afr_fd_ctx_t *)(long)ctx; - list_for_each_entry_safe (locked_fd, tmp, &priv->saved_fds, - list) { + if (fd_ctx) { + _afr_cleanup_fd_ctx(this, fd_ctx); + } - if (locked_fd->fd == fd) { - list_del_init (&locked_fd->list); - GF_FREE (locked_fd); - } +out: + return 0; +} - } +int +afr_release(xlator_t *this, fd_t *fd) +{ + afr_cleanup_fd_ctx(this, fd); - return 0; + return 0; } +afr_fd_ctx_t * +__afr_fd_ctx_get(fd_t *fd, xlator_t *this) +{ + uint64_t ctx = 0; + int ret = 0; + afr_fd_ctx_t *fd_ctx = NULL; -/* {{{ fsync */ + ret = __fd_ctx_get(fd, this, &ctx); -int -afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) -{ - afr_local_t *local = NULL; + if (ret < 0) { + ret = __afr_fd_ctx_set(this, fd); + if (ret < 0) + goto out; + + ret = __fd_ctx_get(fd, this, &ctx); + if (ret < 0) + goto out; + } - int call_count = -1; + fd_ctx = (afr_fd_ctx_t *)(long)ctx; +out: + return fd_ctx; +} - int child_index = (long) cookie; - int read_child = 0; +afr_fd_ctx_t * +afr_fd_ctx_get(fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; - local = frame->local; + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get(fd, this); + } + UNLOCK(&fd->lock); - read_child = afr_read_child (this, local->fd->inode); + return fd_ctx; +} - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; 
- } +int +__afr_fd_ctx_set(xlator_t *this, fd_t *fd) +{ + afr_private_t *priv = NULL; + int ret = -1; + uint64_t ctx = 0; + afr_fd_ctx_t *fd_ctx = NULL; + int i = 0; + + VALIDATE_OR_GOTO(this->private, out); + VALIDATE_OR_GOTO(fd, out); + + priv = this->private; + + ret = __fd_ctx_get(fd, this, &ctx); + + if (ret == 0) + goto out; + + fd_ctx = GF_CALLOC(1, sizeof(afr_fd_ctx_t), gf_afr_mt_afr_fd_ctx_t); + if (!fd_ctx) { + ret = -ENOMEM; + goto out; + } + + fd_ctx->opened_on = GF_CALLOC(sizeof(*fd_ctx->opened_on), priv->child_count, + gf_afr_mt_int32_t); + if (!fd_ctx->opened_on) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (fd_is_anonymous(fd)) + fd_ctx->opened_on[i] = AFR_FD_OPENED; + else + fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; + } - if (op_ret == 0) { - local->op_ret = 0; + fd_ctx->readdir_subvol = -1; + fd_ctx->lk_heal_info = NULL; - if (local->success_count == 0) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; - } + ret = __fd_ctx_set(fd, this, (uint64_t)(long)fd_ctx); + if (ret) + gf_msg_debug(this->name, 0, "failed to set fd ctx (%p)", fd); +out: + if (ret && fd_ctx) + _afr_cleanup_fd_ctx(this, fd_ctx); + return ret; +} - if (child_index == read_child) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; - } +/* {{{ flush */ - local->success_count++; - } +int +afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + int call_count = -1; + + local = frame->local; - local->op_errno = op_errno; + LOCK(&frame->lock); + { + if (op_ret != -1) { + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); + } else { + local->op_errno = op_errno; } - UNLOCK (&frame->lock); + call_count = --local->call_count; + } + UNLOCK(&frame->lock); - call_count = afr_frame_return (frame); + if (call_count == 0) + AFR_STACK_UNWIND(flush, frame, 
local->op_ret, local->op_errno, + local->xdata_rsp); - if (call_count == 0) { - local->cont.fsync.prebuf.ia_ino = local->cont.fsync.ino; - local->cont.fsync.postbuf.ia_ino = local->cont.fsync.ino; + return 0; +} - AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno, - &local->cont.fsync.prebuf, - &local->cont.fsync.postbuf); +static int +afr_flush_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + + priv = this->private; + local = frame->local; + call_count = local->call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE(frame, afr_flush_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->flush, + local->fd, xdata); + if (!--call_count) + break; } + } - return 0; + return 0; +} + +afr_local_t * +afr_wakeup_same_fd_delayed_op(xlator_t *this, afr_lock_t *lock, fd_t *fd) +{ + afr_local_t *local = NULL; + + if (lock->delay_timer) { + local = list_entry(lock->post_op.next, afr_local_t, + transaction.owner_list); + if (fd == local->fd) { + if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) { + local = NULL; + } else { + lock->delay_timer = NULL; + } + } else { + local = NULL; + } + } + + return local; } +void +afr_delayed_changelog_wake_resume(xlator_t *this, inode_t *inode, + call_stub_t *stub) +{ + afr_inode_ctx_t *ctx = NULL; + afr_lock_t *lock = NULL; + afr_local_t *metadata_local = NULL; + afr_local_t *data_local = NULL; + LOCK(&inode->lock); + { + (void)__afr_inode_ctx_get(this, inode, &ctx); + lock = &ctx->lock[AFR_DATA_TRANSACTION]; + data_local = afr_wakeup_same_fd_delayed_op(this, lock, stub->args.fd); + lock = &ctx->lock[AFR_METADATA_TRANSACTION]; + metadata_local = afr_wakeup_same_fd_delayed_op(this, lock, + stub->args.fd); + } + UNLOCK(&inode->lock); + + if (data_local) { + data_local->transaction.resume_stub = stub; + } else if (metadata_local) { + 
metadata_local->transaction.resume_stub = stub; + } else { + call_resume(stub); + } + if (data_local) { + afr_delayed_changelog_wake_up_cbk(data_local); + } + if (metadata_local) { + afr_delayed_changelog_wake_up_cbk(metadata_local); + } +} int -afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) +afr_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; + afr_local_t *local = NULL; + call_stub_t *stub = NULL; + int op_errno = ENOMEM; - int ret = -1; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; + local->op = GF_FOP_FLUSH; + if (!afr_is_consistent_io_possible(local, this->private, &op_errno)) + goto out; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local->fd = fd_ref(fd); - priv = this->private; + stub = fop_flush_stub(frame, afr_flush_wrapper, fd, xdata); + if (!stub) + goto out; - ALLOC_OR_GOTO (local, afr_local_t, out); + afr_delayed_changelog_wake_resume(this, fd->inode, stub); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; + return 0; +out: + AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL); + return 0; +} + +int +afr_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + int call_count = -1; + + local = frame->local; + + LOCK(&frame->lock); + { + if (op_ret == 0) { + local->op_ret = 0; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); + } else { + local->op_errno = op_errno; } + call_count = --local->call_count; + } + UNLOCK(&frame->lock); - call_count = local->call_count; - frame->local = local; + if (call_count == 0) + AFR_STACK_UNWIND(fsyncdir, frame, local->op_ret, 
local->op_errno, + local->xdata_rsp); - local->fd = fd_ref (fd); - local->cont.fsync.ino = fd->inode->ino; + return 0; +} - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_fsync_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fsync, - fd, datasync); - if (!--call_count) - break; - } +int +afr_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int32_t call_count = 0; + int32_t op_errno = ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_FSYNCDIR; + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + + call_count = local->call_count; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND(frame, afr_fsyncdir_cbk, priv->children[i], + priv->children[i]->fops->fsyncdir, fd, datasync, xdata); + if (!--call_count) + break; } + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, NULL, NULL); - } - return 0; + AFR_STACK_UNWIND(fsyncdir, frame, -1, op_errno, NULL); + + return 0; } /* }}} */ -/* {{{ fsync */ +static int +afr_serialized_lock_wind(call_frame_t *frame, xlator_t *this); -int32_t -afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +static gf_boolean_t +afr_is_conflicting_lock_present(int32_t op_ret, int32_t op_errno) { - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; + if (op_ret == -1 && op_errno == EAGAIN) + return _gf_true; + return _gf_false; +} - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; +static void +afr_fop_lock_unwind(call_frame_t *frame, glusterfs_fop_t op, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + switch (op) { + case GF_FOP_INODELK: + 
AFR_STACK_UNWIND(inodelk, frame, op_ret, op_errno, xdata); + break; + case GF_FOP_FINODELK: + AFR_STACK_UNWIND(finodelk, frame, op_ret, op_errno, xdata); + break; + case GF_FOP_ENTRYLK: + AFR_STACK_UNWIND(entrylk, frame, op_ret, op_errno, xdata); + break; + case GF_FOP_FENTRYLK: + AFR_STACK_UNWIND(fentrylk, frame, op_ret, op_errno, xdata); + break; + default: + break; + } +} - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); +static void +afr_fop_lock_wind(call_frame_t *frame, xlator_t *this, int child_index, + int32_t (*lock_cbk)(call_frame_t *, void *, xlator_t *, + int32_t, int32_t, dict_t *)) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = child_index; + + switch (local->op) { + case GF_FOP_INODELK: + STACK_WIND_COOKIE( + frame, lock_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->inodelk, + (const char *)local->cont.inodelk.volume, &local->loc, + local->cont.inodelk.cmd, &local->cont.inodelk.flock, + local->cont.inodelk.xdata); + break; + case GF_FOP_FINODELK: + STACK_WIND_COOKIE( + frame, lock_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->finodelk, + (const char *)local->cont.inodelk.volume, local->fd, + local->cont.inodelk.cmd, &local->cont.inodelk.flock, + local->cont.inodelk.xdata); + break; + case GF_FOP_ENTRYLK: + STACK_WIND_COOKIE( + frame, lock_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->entrylk, local->cont.entrylk.volume, + &local->loc, local->cont.entrylk.basename, + local->cont.entrylk.cmd, local->cont.entrylk.type, + local->cont.entrylk.xdata); + break; + case GF_FOP_FENTRYLK: + STACK_WIND_COOKIE( + frame, lock_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->fentrylk, local->cont.entrylk.volume, + local->fd, local->cont.entrylk.basename, + local->cont.entrylk.cmd, local->cont.entrylk.type, + local->cont.entrylk.xdata); + break; + default: + break; + } +} - call_count = afr_frame_return (frame); +void 
+afr_fop_lock_proceed(call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - if (call_count == 0) - AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, - local->op_errno); + local = frame->local; + priv = frame->this->private; - return 0; + if (local->fop_lock_state != AFR_FOP_LOCK_PARALLEL) { + afr_fop_lock_unwind(frame, local->op, local->op_ret, local->op_errno, + local->xdata_rsp); + return; + } + /* At least one child is up */ + /* + * Non-blocking locks also need to be serialized. Otherwise there is + * a chance that both the mounts which issued same non-blocking inodelk + * may endup not acquiring the lock on any-brick. + * Ex: Mount1 and Mount2 + * request for full length lock on file f1. Mount1 afr may acquire the + * partial lock on brick-1 and may not acquire the lock on brick-2 + * because Mount2 already got the lock on brick-2, vice versa. Since + * both the mounts only got partial locks, afr treats them as failure in + * gaining the locks and unwinds with EAGAIN errno. 
+ */ + local->op_ret = -1; + local->op_errno = EUCLEAN; + local->fop_lock_state = AFR_FOP_LOCK_SERIAL; + afr_local_replies_wipe(local, priv); + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); + local->xdata_rsp = NULL; + switch (local->op) { + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + local->cont.inodelk.cmd = local->cont.inodelk.in_cmd; + local->cont.inodelk.flock = local->cont.inodelk.in_flock; + if (local->cont.inodelk.xdata) + dict_unref(local->cont.inodelk.xdata); + local->cont.inodelk.xdata = NULL; + if (local->xdata_req) + local->cont.inodelk.xdata = dict_ref(local->xdata_req); + break; + case GF_FOP_ENTRYLK: + case GF_FOP_FENTRYLK: + local->cont.entrylk.cmd = local->cont.entrylk.in_cmd; + if (local->cont.entrylk.xdata) + dict_unref(local->cont.entrylk.xdata); + local->cont.entrylk.xdata = NULL; + if (local->xdata_req) + local->cont.entrylk.xdata = dict_ref(local->xdata_req); + break; + default: + break; + } + afr_serialized_lock_wind(frame, frame->this); } +static int32_t +afr_unlock_partial_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) -int32_t -afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int child_index = (long)cookie; + uuid_t gfid = {0}; - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; + local = frame->local; + priv = this->private; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + if (op_ret < 0 && op_errno != ENOTCONN) { + if (local->fd) + gf_uuid_copy(gfid, local->fd->inode->gfid); + else + loc_gfid(&local->loc, gfid); + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL, + "%s: Failed to unlock %s on %s " + "with lk_owner: %s", + uuid_utoa(gfid), gf_fop_list[local->op], + 
priv->children[child_index]->name, + lkowner_utoa(&frame->root->lk_owner)); + } + + call_count = afr_frame_return(frame); + if (call_count == 0) + afr_fop_lock_proceed(frame); + + return 0; +} - priv = this->private; +static int32_t +afr_unlock_locks_and_proceed(call_frame_t *frame, xlator_t *this, + int call_count) +{ + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + if (call_count == 0) { + afr_fop_lock_proceed(frame); + goto out; + } + + local = frame->local; + priv = this->private; + local->call_count = call_count; + switch (local->op) { + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + local->cont.inodelk.flock.l_type = F_UNLCK; + local->cont.inodelk.cmd = F_SETLK; + if (local->cont.inodelk.xdata) + dict_unref(local->cont.inodelk.xdata); + local->cont.inodelk.xdata = NULL; + break; + case GF_FOP_ENTRYLK: + case GF_FOP_FENTRYLK: + local->cont.entrylk.cmd = ENTRYLK_UNLOCK; + if (local->cont.entrylk.xdata) + dict_unref(local->cont.entrylk.xdata); + local->cont.entrylk.xdata = NULL; + break; + default: + break; + } - ALLOC_OR_GOTO (local, afr_local_t, out); + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + if (local->replies[i].op_ret == -1) + continue; - call_count = local->call_count; - frame->local = local; + afr_fop_lock_wind(frame, this, i, afr_unlock_partial_lock_cbk); - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_fsyncdir_cbk, - priv->children[i], - priv->children[i]->fops->fsyncdir, - fd, datasync); - if (!--call_count) - break; - } - } + if (!--call_count) + break; + } - op_ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fsyncdir, frame, op_ret, op_errno); - } - return 0; + return 0; } -/* }}} */ - -/* {{{ xattrop */ - int32_t -afr_xattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t 
*xattr) +afr_fop_lock_done(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + int i = 0; + int lock_count = 0; + unsigned char *success = NULL; - int call_count = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; + local = frame->local; + priv = this->private; + success = alloca0(priv->child_count); - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; - local->op_errno = op_errno; + if (local->replies[i].op_ret == 0) { + lock_count++; + success[i] = 1; } - UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + if (local->op_ret == -1 && local->op_errno == EAGAIN) + continue; - if (call_count == 0) - AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno, - xattr); + if ((local->replies[i].op_ret == -1) && + (local->replies[i].op_errno == EAGAIN)) { + local->op_ret = -1; + local->op_errno = EAGAIN; + continue; + } - return 0; -} + if (local->replies[i].op_ret == 0) + local->op_ret = 0; + local->op_errno = local->replies[i].op_errno; + } -int32_t -afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; + if (afr_fop_lock_is_unlock(frame)) + goto unwind; - int ret = -1; + if (afr_is_conflicting_lock_present(local->op_ret, local->op_errno)) { + afr_unlock_locks_and_proceed(frame, this, lock_count); + } else if (priv->quorum_count && !afr_has_quorum(success, this, NULL)) { + local->fop_lock_state = AFR_FOP_LOCK_QUORUM_FAILED; + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + if (local->op_errno == 0) + local->op_errno = afr_quorum_errno(priv); + afr_unlock_locks_and_proceed(frame, this, lock_count); + } else { + goto unwind; + } + + return 0; +unwind: + afr_fop_lock_unwind(frame, local->op, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; +} - 
int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; +static int +afr_common_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = (long)cookie; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local = frame->local; - priv = this->private; + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret == 0 && xdata) { + local->replies[child_index].xdata = dict_ref(xdata); + LOCK(&frame->lock); + { + if (!local->xdata_rsp) + local->xdata_rsp = dict_ref(xdata); + } + UNLOCK(&frame->lock); + } + return 0; +} - ALLOC_OR_GOTO (local, afr_local_t, out); +static int32_t +afr_serialized_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = (long)cookie; + int next_child = 0; + + local = frame->local; + priv = this->private; + + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); + + for (next_child = child_index + 1; next_child < priv->child_count; + next_child++) { + if (local->child_up[next_child]) + break; + } + + if (afr_is_conflicting_lock_present(op_ret, op_errno) || + (next_child == priv->child_count)) { + afr_fop_lock_done(frame, this); + } else { + afr_fop_lock_wind(frame, this, next_child, afr_serialized_lock_cbk); + } + + return 0; +} - call_count = local->call_count; - frame->local = local; +static int +afr_serialized_lock_wind(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, 
afr_xattrop_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - loc, optype, xattr); - if (!--call_count) - break; - } - } + priv = this->private; + local = frame->local; - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (xattrop, frame, op_ret, op_errno, NULL); + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + afr_fop_lock_wind(frame, this, i, afr_serialized_lock_cbk); + break; } - return 0; + } + return 0; } -/* }}} */ +static int32_t +afr_parallel_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) -/* {{{ fxattrop */ +{ + int call_count = 0; -int32_t -afr_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); + + call_count = afr_frame_return(frame); + if (call_count == 0) + afr_fop_lock_done(frame, this); + + return 0; +} + +static int +afr_parallel_lock_wind(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int call_count = 0; + int i = 0; + + priv = this->private; + local = frame->local; + call_count = local->call_count; + + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + afr_fop_lock_wind(frame, this, i, afr_parallel_lock_cbk); + if (!--call_count) + break; + } + return 0; +} - int call_count = -1; +static int +afr_fop_handle_lock(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = frame->local; + int op_errno = 0; - local = frame->local; + if (!afr_fop_lock_is_unlock(frame)) { + if (!afr_is_consistent_io_possible(local, this->private, &op_errno)) + goto out; - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; + switch (local->op) { + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + local->cont.inodelk.cmd = F_SETLK; + break; + case GF_FOP_ENTRYLK: + case GF_FOP_FENTRYLK: + 
local->cont.entrylk.cmd = ENTRYLK_LOCK_NB; + break; + default: + break; + } + } - local->op_errno = op_errno; + if (local->xdata_req) { + switch (local->op) { + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + local->cont.inodelk.xdata = dict_ref(local->xdata_req); + break; + case GF_FOP_ENTRYLK: + case GF_FOP_FENTRYLK: + local->cont.entrylk.xdata = dict_ref(local->xdata_req); + break; + default: + break; } - UNLOCK (&frame->lock); + } - call_count = afr_frame_return (frame); + local->fop_lock_state = AFR_FOP_LOCK_PARALLEL; + afr_parallel_lock_wind(frame, this); +out: + return -op_errno; +} - if (call_count == 0) - AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno, - xattr); +static int32_t +afr_handle_inodelk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + const char *volume, loc_t *loc, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t op_errno = ENOMEM; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = fop; + if (loc) + loc_copy(&local->loc, loc); + if (fd && (flock->l_type != F_UNLCK)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local->fd = fd_ref(fd); + } + + local->cont.inodelk.volume = gf_strdup(volume); + if (!local->cont.inodelk.volume) { + op_errno = ENOMEM; + goto out; + } + + local->cont.inodelk.in_cmd = cmd; + local->cont.inodelk.cmd = cmd; + local->cont.inodelk.in_flock = *flock; + local->cont.inodelk.flock = *flock; + if (xdata) + local->xdata_req = dict_ref(xdata); + + op_errno = -afr_fop_handle_lock(frame, frame->this); + if (op_errno) + goto out; + return 0; +out: + afr_fop_lock_unwind(frame, fop, -1, op_errno, NULL); - return 0; + return 0; } +int32_t +afr_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) +{ + afr_handle_inodelk(frame, this, GF_FOP_INODELK, volume, loc, NULL, cmd, + flock, xdata); + return 0; +} int32_t 
-afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr) +afr_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; + afr_handle_inodelk(frame, this, GF_FOP_FINODELK, volume, NULL, fd, cmd, + flock, xdata); + return 0; +} - int ret = -1; +static int +afr_handle_entrylk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + const char *volume, loc_t *loc, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t op_errno = ENOMEM; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = fop; + if (loc) + loc_copy(&local->loc, loc); + if (fd && (cmd != ENTRYLK_UNLOCK)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local->fd = fd_ref(fd); + } + local->cont.entrylk.cmd = cmd; + local->cont.entrylk.in_cmd = cmd; + local->cont.entrylk.type = type; + local->cont.entrylk.volume = gf_strdup(volume); + local->cont.entrylk.basename = gf_strdup(basename); + if (!local->cont.entrylk.volume || !local->cont.entrylk.basename) { + op_errno = ENOMEM; + goto out; + } + if (xdata) + local->xdata_req = dict_ref(xdata); + op_errno = -afr_fop_handle_lock(frame, frame->this); + if (op_errno) + goto out; + + return 0; +out: + afr_fop_lock_unwind(frame, fop, -1, op_errno, NULL); + return 0; +} - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; +int +afr_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) +{ + afr_handle_entrylk(frame, this, GF_FOP_ENTRYLK, volume, loc, NULL, basename, + cmd, type, xdata); + return 0; +} - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); +int 
+afr_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) +{ + afr_handle_entrylk(frame, this, GF_FOP_FENTRYLK, volume, NULL, fd, basename, + cmd, type, xdata); + return 0; +} - priv = this->private; +int +afr_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *statvfs, dict_t *xdata) +{ + afr_local_t *local = NULL; + int call_count = 0; + struct statvfs *buf = NULL; - ALLOC_OR_GOTO (local, afr_local_t, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; + LOCK(&frame->lock); + { + if (op_ret != 0) { + local->op_errno = op_errno; + goto unlock; } - call_count = local->call_count; - frame->local = local; + local->op_ret = op_ret; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_fxattrop_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - fd, optype, xattr); - if (!--call_count) - break; + buf = &local->cont.statfs.buf; + if (local->cont.statfs.buf_set) { + if (statvfs->f_bavail < buf->f_bavail) { + *buf = *statvfs; + if (xdata) { + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); + local->xdata_rsp = dict_ref(xdata); } + } + } else { + *buf = *statvfs; + local->cont.statfs.buf_set = 1; + if (xdata) + local->xdata_rsp = dict_ref(xdata); } + } +unlock: + call_count = --local->call_count; + UNLOCK(&frame->lock); - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, NULL); - } - return 0; + if (call_count == 0) + AFR_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno, + &local->cont.statfs.buf, local->xdata_rsp); + + return 0; } -/* }}} */ +int +afr_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + int32_t op_errno = 
ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_STATFS; + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + + if (priv->arbiter_count == 1 && local->child_up[ARBITER_BRICK_INDEX]) + local->call_count--; + call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + if (AFR_IS_ARBITER_BRICK(priv, i)) + continue; + STACK_WIND(frame, afr_statfs_cbk, priv->children[i], + priv->children[i]->fops->statfs, loc, xdata); + if (!--call_count) + break; + } + } + return 0; +out: + AFR_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL); + + return 0; +} int32_t -afr_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +afr_lk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = this->private; + int call_count = -1; + int child_index = (long)cookie; + + local = frame->local; + + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL, + "gfid=%s: unlock failed on subvolume %s " + "with lock owner %s", + uuid_utoa(local->fd->inode->gfid), + priv->children[child_index]->name, + lkowner_utoa(&frame->root->lk_owner)); + } + + call_count = afr_frame_return(frame); + if (call_count == 0) { + AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL, + local->xdata_rsp); + } + + return 0; +} +int32_t +afr_lk_unlock(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; - int call_count = -1; + local = frame->local; + priv = this->private; - local = frame->local; + call_count = 
afr_locked_nodes_count(local->cont.lk.locked_nodes, + priv->child_count); - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; + if (call_count == 0) { + AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL, + local->xdata_rsp); + return 0; + } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + local->call_count = call_count; - call_count = afr_frame_return (frame); + local->cont.lk.user_flock.l_type = F_UNLCK; - if (call_count == 0) - AFR_STACK_UNWIND (inodelk, frame, local->op_ret, - local->op_errno); + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lk.locked_nodes[i]) { + STACK_WIND_COOKIE(frame, afr_lk_unlock_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->lk, + local->fd, F_SETLK, &local->cont.lk.user_flock, + NULL); - return 0; -} + if (!--call_count) + break; + } + } + return 0; +} int32_t -afr_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock) +afr_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = -1; - int ret = -1; + local = frame->local; + priv = this->private; - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; + child_index = (long)cookie; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); + if (op_ret < 0 && op_errno == EAGAIN) { + local->op_ret = -1; + local->op_errno = EAGAIN; - priv = this->private; + afr_lk_unlock(frame, this); + return 0; + } + + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lk.locked_nodes[child_index] = 1; + local->cont.lk.ret_flock = *lock; + } + + child_index++; + + if (child_index < 
priv->child_count) { + STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)child_index, + priv->children[child_index], + priv->children[child_index]->fops->lk, local->fd, + local->cont.lk.cmd, &local->cont.lk.user_flock, + local->xdata_req); + } else if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) { + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); - ALLOC_OR_GOTO (local, afr_local_t, out); + afr_lk_unlock(frame, this); + } else { + if (local->op_ret < 0) + local->op_errno = afr_final_errno(local, priv); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, + &local->cont.lk.ret_flock, local->xdata_rsp); + } - call_count = local->call_count; - frame->local = local; + return 0; +} - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_inodelk_cbk, - priv->children[i], - priv->children[i]->fops->inodelk, - volume, loc, cmd, flock); - - if (!--call_count) - break; - } - } +int +afr_lk_transaction_cbk(int ret, call_frame_t *frame, void *opaque) +{ + return 0; +} - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (inodelk, frame, op_ret, op_errno); - } - return 0; +int +afr_lk_txn_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = -1; + + local = frame->local; + child_index = (long)cookie; + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lk.locked_nodes[child_index] = 1; + local->cont.lk.ret_flock = *lock; + } + syncbarrier_wake(&local->barrier); + return 0; } +int +afr_lk_txn_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t 
*local = frame->local; + afr_private_t *priv = this->private; + int child_index = (long)cookie; + + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL, + "gfid=%s: unlock failed on subvolume %s " + "with lock owner %s", + uuid_utoa(local->fd->inode->gfid), + priv->children[child_index]->name, + lkowner_utoa(&frame->root->lk_owner)); + } + return 0; +} +int +afr_lk_transaction(void *opaque) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + char *wind_on = NULL; + int op_errno = 0; + int i = 0; + int ret = 0; + + frame = (call_frame_t *)opaque; + local = frame->local; + this = frame->this; + priv = this->private; + wind_on = alloca0(priv->child_count); + + if (priv->arbiter_count || priv->child_count != 3) { + op_errno = ENOTSUP; + gf_msg(frame->this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Lock healing supported only for replica 3 volumes.", + uuid_utoa(local->fd->inode->gfid)); + goto err; + } + + op_errno = -afr_dom_lock_acquire(frame); // Released during + // AFR_STACK_UNWIND + if (op_errno != 0) { + goto err; + } + if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.dom_locked_nodes, this, NULL)) { + op_errno = afr_final_errno(local, priv); + goto err; + } + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && local->cont.lk.dom_locked_nodes[i]) + wind_on[i] = 1; + } + AFR_ONLIST(wind_on, frame, afr_lk_txn_wind_cbk, lk, local->fd, + local->cont.lk.cmd, &local->cont.lk.user_flock, + local->xdata_req); + + if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) { + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + goto unlock; + } else { + if (local->cont.lk.user_flock.l_type == F_UNLCK) + ret = afr_remove_lock_from_saved_locks(local, this); + else + ret = afr_add_lock_to_saved_locks(frame, this); + if (ret) { + local->op_ret 
= -1; + local->op_errno = -ret; + goto unlock; + } + AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, + &local->cont.lk.ret_flock, local->xdata_rsp); + } -int32_t -afr_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + return 0; +unlock: + local->cont.lk.user_flock.l_type = F_UNLCK; + AFR_ONLIST(local->cont.lk.locked_nodes, frame, afr_lk_txn_unlock_cbk, lk, + local->fd, F_SETLK, &local->cont.lk.user_flock, NULL); +err: + AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); + return -1; +} + +int +afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) { - afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int ret = 0; + int i = 0; + int32_t op_errno = ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_LK; + if (!afr_lk_is_unlock(cmd, flock)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + } + + local->cont.lk.locked_nodes = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.locked_nodes), + gf_afr_mt_char); + + if (!local->cont.lk.locked_nodes) { + op_errno = ENOMEM; + goto out; + } + + local->fd = fd_ref(fd); + local->cont.lk.cmd = cmd; + local->cont.lk.user_flock = *flock; + local->cont.lk.ret_flock = *flock; + if (xdata) + local->xdata_req = dict_ref(xdata); + + if (afr_is_lock_mode_mandatory(xdata)) { + ret = synctask_new(this->ctx->env, afr_lk_transaction, + afr_lk_transaction_cbk, frame, frame); + if (ret) { + op_errno = ENOMEM; + goto out; + } + return 0; + } - int call_count = -1; + STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)0, priv->children[i], + priv->children[i]->fops->lk, fd, cmd, flock, + local->xdata_req); - local = frame->local; + return 0; +out: + AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); - LOCK 
(&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; + return 0; +} - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); +int32_t +afr_lease_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_lease *lease, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int call_count = -1; - call_count = afr_frame_return (frame); + local = frame->local; + call_count = afr_frame_return(frame); - if (call_count == 0) - AFR_STACK_UNWIND (finodelk, frame, local->op_ret, - local->op_errno); + if (call_count == 0) + AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno, lease, + xdata); - return 0; + return 0; } - int32_t -afr_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock) +afr_lease_unlock(call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; - int ret = -1; + local = frame->local; + priv = this->private; - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; + call_count = afr_locked_nodes_count(local->cont.lease.locked_nodes, + priv->child_count); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + if (call_count == 0) { + AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno, + &local->cont.lease.ret_lease, NULL); + return 0; + } - priv = this->private; + local->call_count = call_count; - ALLOC_OR_GOTO (local, afr_local_t, out); + local->cont.lease.user_lease.cmd = GF_UNLK_LEASE; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lease.locked_nodes[i]) { + STACK_WIND(frame, afr_lease_unlock_cbk, priv->children[i], + priv->children[i]->fops->lease, &local->loc, + &local->cont.lease.user_lease, NULL); + + if 
(!--call_count) + break; } + } - call_count = local->call_count; - frame->local = local; + return 0; +} - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_finodelk_cbk, - priv->children[i], - priv->children[i]->fops->finodelk, - volume, fd, cmd, flock); - - if (!--call_count) - break; - } - } +int32_t +afr_lease_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_lease *lease, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = -1; - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (finodelk, frame, op_ret, op_errno); - } + local = frame->local; + priv = this->private; + + child_index = (long)cookie; + + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); + if (op_ret < 0 && op_errno == EAGAIN) { + local->op_ret = -1; + local->op_errno = EAGAIN; + + afr_lease_unlock(frame, this); return 0; -} + } + + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lease.locked_nodes[child_index] = 1; + local->cont.lease.ret_lease = *lease; + } + + child_index++; + if (child_index < priv->child_count) { + STACK_WIND_COOKIE(frame, afr_lease_cbk, (void *)(long)child_index, + priv->children[child_index], + priv->children[child_index]->fops->lease, &local->loc, + &local->cont.lease.user_lease, xdata); + } else if (priv->quorum_count && + !afr_has_quorum(local->cont.lease.locked_nodes, this, NULL)) { + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + afr_lease_unlock(frame, this); + } else { + if (local->op_ret < 0) + local->op_errno = afr_final_errno(local, priv); + AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno, + &local->cont.lease.ret_lease, NULL); + } -int32_t -afr_entrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + return 0; +} +int +afr_lease(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct 
gf_lease *lease, dict_t *xdata) { - afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int32_t op_errno = ENOMEM; - int call_count = -1; + priv = this->private; - local = frame->local; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; + local->op = GF_FOP_LEASE; + local->cont.lease.locked_nodes = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lease.locked_nodes), + gf_afr_mt_char); - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + if (!local->cont.lease.locked_nodes) { + op_errno = ENOMEM; + goto out; + } - call_count = afr_frame_return (frame); + loc_copy(&local->loc, loc); + local->cont.lease.user_lease = *lease; + local->cont.lease.ret_lease = *lease; - if (call_count == 0) - AFR_STACK_UNWIND (entrylk, frame, local->op_ret, - local->op_errno); + STACK_WIND_COOKIE(frame, afr_lease_cbk, (void *)(long)0, priv->children[0], + priv->children[0]->fops->lease, loc, lease, xdata); - return 0; -} + return 0; +out: + AFR_STACK_UNWIND(lease, frame, -1, op_errno, NULL, NULL); + return 0; +} -int32_t -afr_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, - const char *basename, entrylk_cmd cmd, entrylk_type type) +int +afr_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; + afr_local_t *local = NULL; + int child_index = (long)cookie; + int call_count = 0; + gf_boolean_t failed = _gf_false; + gf_boolean_t succeeded = _gf_false; + int i = 0; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (xdata) + local->replies[child_index].xdata = dict_ref(xdata); + + call_count = afr_frame_return(frame); + if (call_count) + goto 
out; + /* If any of the subvolumes failed with other than ENOTCONN + * return error else return success unless all the subvolumes + * failed. + * TODO: In case of failure, we need to unregister the xattrs + * from the other subvolumes where it succeeded (once upcall + * fixes the Bz-1371622)*/ + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0 && + local->replies[i].op_errno != ENOTCONN) { + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); + local->xdata_rsp = NULL; + if (local->replies[i].xdata) { + local->xdata_rsp = dict_ref(local->replies[i].xdata); + } + failed = _gf_true; + break; + } + if (local->replies[i].op_ret == 0) { + succeeded = _gf_true; + local->op_ret = 0; + local->op_errno = 0; + if (!local->xdata_rsp && local->replies[i].xdata) { + local->xdata_rsp = dict_ref(local->replies[i].xdata); + } + } + } + + if (!succeeded && !failed) { + local->op_ret = -1; + local->op_errno = ENOTCONN; + } - int ret = -1; + AFR_STACK_UNWIND(ipc, frame, local->op_ret, local->op_errno, + local->xdata_rsp); - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; +out: + return 0; +} + +int +afr_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t op_errno = -1; + afr_private_t *priv = NULL; + int i = 0; + int call_cnt = -1; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); - priv = this->private; + if (op != GF_IPC_TARGET_UPCALL) + goto wind_default; - ALLOC_OR_GOTO (local, afr_local_t, out); + VALIDATE_OR_GOTO(this->private, err); + priv = this->private; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + 
goto err; - call_count = local->call_count; - frame->local = local; + call_cnt = local->call_count; + if (xdata) { for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_entrylk_cbk, - priv->children[i], - priv->children[i]->fops->entrylk, - volume, loc, basename, cmd, type); - - if (!--call_count) - break; - } + if (dict_set_int8(xdata, priv->pending_key[i], 0) < 0) + goto err; } + } + + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + + STACK_WIND_COOKIE(frame, afr_ipc_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->ipc, op, + xdata); + if (!--call_cnt) + break; + } + return 0; + +err: + if (op_errno == -1) + op_errno = errno; + AFR_STACK_UNWIND(ipc, frame, -1, op_errno, NULL); + + return 0; + +wind_default: + STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ipc, op, xdata); + return 0; +} - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (entrylk, frame, op_ret, op_errno); - } +int +afr_forget(xlator_t *this, inode_t *inode) +{ + uint64_t ctx_int = 0; + afr_inode_ctx_t *ctx = NULL; + + afr_spb_choice_timeout_cancel(this, inode); + inode_ctx_del(inode, this, &ctx_int); + if (!ctx_int) return 0; + + ctx = (afr_inode_ctx_t *)(uintptr_t)ctx_int; + afr_inode_ctx_destroy(ctx); + return 0; } +int +afr_priv_dump(xlator_t *this) +{ + afr_private_t *priv = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + + GF_ASSERT(this); + priv = this->private; + + GF_ASSERT(priv); + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); + gf_proc_dump_add_section("%s", key_prefix); + gf_proc_dump_write("child_count", "%u", priv->child_count); + for (i = 0; i < priv->child_count; i++) { + sprintf(key, "child_up[%d]", i); + gf_proc_dump_write(key, "%d", priv->child_up[i]); + sprintf(key, "pending_key[%d]", i); + gf_proc_dump_write(key, "%s", priv->pending_key[i]); + sprintf(key, 
"pending_reads[%d]", i); + gf_proc_dump_write(key, "%" PRId64, + GF_ATOMIC_GET(priv->pending_reads[i])); + sprintf(key, "child_latency[%d]", i); + gf_proc_dump_write(key, "%" PRId64, priv->child_latency[i]); + sprintf(key, "halo_child_up[%d]", i); + gf_proc_dump_write(key, "%d", priv->halo_child_up[i]); + } + gf_proc_dump_write("data_self_heal", "%d", priv->data_self_heal); + gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal); + gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal); + gf_proc_dump_write("read_child", "%d", priv->read_child); + gf_proc_dump_write("wait_count", "%u", priv->wait_count); + gf_proc_dump_write("heal-wait-queue-length", "%d", priv->heal_wait_qlen); + gf_proc_dump_write("heal-waiters", "%d", priv->heal_waiters); + gf_proc_dump_write("background-self-heal-count", "%d", + priv->background_self_heal_count); + gf_proc_dump_write("healers", "%d", priv->healers); + gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode); + gf_proc_dump_write("use-anonymous-inode", "%d", priv->use_anon_inode); + if (priv->quorum_count == AFR_QUORUM_AUTO) { + gf_proc_dump_write("quorum-type", "auto"); + } else if (priv->quorum_count == 0) { + gf_proc_dump_write("quorum-type", "none"); + } else { + gf_proc_dump_write("quorum-type", "fixed"); + gf_proc_dump_write("quorum-count", "%d", priv->quorum_count); + } + gf_proc_dump_write("up", "%u", afr_has_quorum(priv->child_up, this, NULL)); + if (priv->thin_arbiter_count) { + gf_proc_dump_write("ta_child_up", "%d", priv->ta_child_up); + gf_proc_dump_write("ta_bad_child_index", "%d", + priv->ta_bad_child_index); + gf_proc_dump_write("ta_notify_dom_lock_offset", "%" PRId64, + priv->ta_notify_dom_lock_offset); + } + + return 0; +} +/** + * find_child_index - find the child's index in the array of subvolumes + * @this: AFR + * @child: child + */ -int32_t -afr_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +static int 
+afr_find_child_index(xlator_t *this, xlator_t *child) +{ + afr_private_t *priv = NULL; + int child_count = -1; + int i = -1; + + priv = this->private; + child_count = priv->child_count; + if (priv->thin_arbiter_count) { + child_count++; + } + + for (i = 0; i < child_count; i++) { + if ((xlator_t *)child == priv->children[i]) + break; + } + + return i; +} +int +__afr_get_up_children_count(afr_private_t *priv) { - afr_local_t *local = NULL; + int up_children = 0; + int i = 0; - int call_count = -1; + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 1) + up_children++; - local = frame->local; + return up_children; +} - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; +static int +__get_heard_from_all_status(xlator_t *this) +{ + afr_private_t *priv = this->private; + int i; - local->op_errno = op_errno; + for (i = 0; i < priv->child_count; i++) { + if (!priv->last_event[i]) { + return 0; } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); + } + if (priv->thin_arbiter_count && !priv->ta_child_up) { + return 0; + } + return 1; +} - if (call_count == 0) - AFR_STACK_UNWIND (fentrylk, frame, local->op_ret, - local->op_errno); +glusterfs_event_t +__afr_transform_event_from_state(xlator_t *this) +{ + int i = 0; + int up_children = 0; + afr_private_t *priv = this->private; + + if (__get_heard_from_all_status(this)) + /* have_heard_from_all. Let afr_notify() do the propagation. */ + return GF_EVENT_MAXVAL; + + up_children = __afr_get_up_children_count(priv); + /* Treat the children with pending notification, as having sent a + * GF_EVENT_CHILD_DOWN. i.e. 
set the event as GF_EVENT_SOME_DESCENDENT_DOWN, + * as done in afr_notify() */ + for (i = 0; i < priv->child_count; i++) { + if (priv->last_event[i]) + continue; + priv->last_event[i] = GF_EVENT_SOME_DESCENDENT_DOWN; + priv->child_up[i] = 0; + } + + if (up_children) + /* We received at least one child up */ + return GF_EVENT_CHILD_UP; + else + return GF_EVENT_CHILD_DOWN; + + return GF_EVENT_MAXVAL; +} - return 0; +static void +afr_notify_cbk(void *data) +{ + xlator_t *this = data; + afr_private_t *priv = this->private; + glusterfs_event_t event = GF_EVENT_MAXVAL; + gf_boolean_t propagate = _gf_false; + + LOCK(&priv->lock); + { + if (!priv->timer) { + /* + * Either child_up/child_down is already sent to parent. + * This is a spurious wake up. + */ + goto unlock; + } + priv->timer = NULL; + event = __afr_transform_event_from_state(this); + if (event != GF_EVENT_MAXVAL) + propagate = _gf_true; + } +unlock: + UNLOCK(&priv->lock); + if (propagate) + default_notify(this, event, NULL); } +static void +__afr_launch_notify_timer(xlator_t *this, afr_private_t *priv) +{ + struct timespec delay = { + 0, + }; + + gf_msg_debug(this->name, 0, "Initiating child-down timer"); + delay.tv_sec = 10; + delay.tv_nsec = 0; + priv->timer = gf_timer_call_after(this->ctx, delay, afr_notify_cbk, this); + if (priv->timer == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_TIMER_CREATE_FAIL, + "Cannot create timer for delayed initialization"); + } +} -int32_t -afr_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, - const char *basename, entrylk_cmd cmd, entrylk_type type) +static int +find_best_down_child(xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = -1; + int32_t best_child = -1; + int64_t best_latency = INT64_MAX; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!priv->child_up[i] && priv->child_latency[i] >= 0 && + priv->child_latency[i] < best_latency) { + 
best_child = i; + best_latency = priv->child_latency[i]; + } + } + if (best_child >= 0) { + gf_msg_debug(this->name, 0, + "Found best down child (%d) @ %" PRId64 " ms latency", + best_child, best_latency); + } + return best_child; +} - int ret = -1; +int +find_worst_up_child(xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = -1; + int32_t worst_child = -1; + int64_t worst_latency = INT64_MIN; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && priv->child_latency[i] >= 0 && + priv->child_latency[i] > worst_latency) { + worst_child = i; + worst_latency = priv->child_latency[i]; + } + } + if (worst_child >= 0) { + gf_msg_debug(this->name, 0, + "Found worst up child (%d) @ %" PRId64 " ms latency", + worst_child, worst_latency); + } + return worst_child; +} - int i = 0; - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; +void +__afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx, + int64_t halo_max_latency_msec, int32_t *event, + int64_t child_latency_msec) +{ + afr_private_t *priv = NULL; + int up_children = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + priv = this->private; - priv = this->private; + priv->child_latency[idx] = child_latency_msec; + gf_msg_debug(child_xlator->name, 0, "Client ping @ %" PRId64 " ms", + child_latency_msec); + if (priv->shd.iamshd) + return; - ALLOC_OR_GOTO (local, afr_local_t, out); + up_children = __afr_get_up_children_count(priv); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; + if (child_latency_msec > halo_max_latency_msec && + priv->child_up[idx] == 1 && up_children > priv->halo_min_replicas) { + if ((up_children - 1) < priv->halo_min_replicas) { + gf_log(child_xlator->name, GF_LOG_INFO, + "Overriding halo threshold, " + "min replicas: %d", + priv->halo_min_replicas); + } else { + gf_log(child_xlator->name, GF_LOG_INFO, + "Child 
latency (%" PRId64 + " ms) " + "exceeds halo threshold (%" PRId64 + "), " + "marking child down.", + child_latency_msec, halo_max_latency_msec); + if (priv->halo_child_up[idx]) { + *event = GF_EVENT_CHILD_DOWN; + } + } + } else if (child_latency_msec < halo_max_latency_msec && + priv->child_up[idx] == 0) { + if (up_children < priv->halo_max_replicas) { + gf_log(child_xlator->name, GF_LOG_INFO, + "Child latency (%" PRId64 + " ms) " + "below halo threshold (%" PRId64 + "), " + "marking child up.", + child_latency_msec, halo_max_latency_msec); + if (priv->halo_child_up[idx]) { + *event = GF_EVENT_CHILD_UP; + } + } else { + gf_log(child_xlator->name, GF_LOG_INFO, + "Not marking child %d up, " + "max replicas (%d) reached.", + idx, priv->halo_max_replicas); } + } +} - call_count = local->call_count; - frame->local = local; +static int64_t +afr_get_halo_latency(xlator_t *this) +{ + afr_private_t *priv = NULL; + int64_t halo_max_latency_msec = 0; + + priv = this->private; + + if (priv->shd.iamshd) { + halo_max_latency_msec = priv->shd.halo_max_latency_msec; + } else if (priv->nfsd.iamnfsd) { + halo_max_latency_msec = priv->nfsd.halo_max_latency_msec; + } else { + halo_max_latency_msec = priv->halo_max_latency_msec; + } + gf_msg_debug(this->name, 0, "Using halo latency %" PRId64, + halo_max_latency_msec); + return halo_max_latency_msec; +} - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_fentrylk_cbk, - priv->children[i], - priv->children[i]->fops->fentrylk, - volume, fd, basename, cmd, type); - - if (!--call_count) - break; - } +void +__afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator, + const int idx, int64_t child_latency_msec, + int32_t *event, int32_t *call_psh, + int32_t *up_child) +{ + afr_private_t *priv = NULL; + int up_children = 0; + int worst_up_child = -1; + int64_t halo_max_latency_msec = afr_get_halo_latency(this); + + priv = this->private; + + /* + * This only really counts if the child was 
never up + * (value = -1) or had been down (value = 0). See + * comment at GF_EVENT_CHILD_DOWN for a more detailed + * explanation. + */ + if (priv->child_up[idx] != 1) { + priv->event_generation++; + } + priv->child_up[idx] = 1; + + *call_psh = 1; + *up_child = idx; + up_children = __afr_get_up_children_count(priv); + /* + * If this is an _actual_ CHILD_UP event, we + * want to set the child_latency to MAX to indicate + * the child needs ping data to be available before doing child-up + */ + if (!priv->halo_enabled) + goto out; + + if (child_latency_msec < 0) { + /*set to INT64_MAX-1 so that it is found for best_down_child*/ + priv->halo_child_up[idx] = 1; + if (priv->child_latency[idx] < 0) { + priv->child_latency[idx] = AFR_HALO_MAX_LATENCY; + } + } + + /* + * Handle the edge case where we exceed + * halo_min_replicas and we've got a child which is + * marked up as it was helping to satisfy the + * halo_min_replicas even though it's latency exceeds + * halo_max_latency_msec. + */ + if (up_children > priv->halo_min_replicas) { + worst_up_child = find_worst_up_child(this); + if (worst_up_child >= 0 && + priv->child_latency[worst_up_child] > halo_max_latency_msec) { + gf_msg_debug(this->name, 0, + "Marking child %d down, " + "doesn't meet halo threshold (%" PRId64 + "), and > " + "halo_min_replicas (%d)", + worst_up_child, halo_max_latency_msec, + priv->halo_min_replicas); + priv->child_up[worst_up_child] = 0; + up_children--; } + } - op_ret = 0; + if (up_children > priv->halo_max_replicas && !priv->shd.iamshd) { + worst_up_child = find_worst_up_child(this); + if (worst_up_child < 0) { + worst_up_child = idx; + } + priv->child_up[worst_up_child] = 0; + up_children--; + gf_msg_debug(this->name, 0, + "Marking child %d down, " + "up_children (%d) > halo_max_replicas (%d)", + worst_up_child, up_children, priv->halo_max_replicas); + } out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fentrylk, frame, op_ret, op_errno); + if (up_children == 1) { + gf_msg(this->name, 
GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP, + "Subvolume '%s' came back up; " + "going online.", + child_xlator->name); + gf_event(EVENT_AFR_SUBVOL_UP, "client-pid=%d; subvol=%s", + this->ctx->cmd_args.client_pid, this->name); + } else { + *event = GF_EVENT_SOME_DESCENDENT_UP; + } + + priv->last_event[idx] = *event; +} + +void +__afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx, + int64_t child_latency_msec, int32_t *event, + int32_t *call_psh, int32_t *up_child) +{ + afr_private_t *priv = NULL; + int i = 0; + int up_children = 0; + int down_children = 0; + int best_down_child = -1; + + priv = this->private; + + /* + * If a brick is down when we start, we'll get a + * CHILD_DOWN to indicate its initial state. There + * was never a CHILD_UP in this case, so if we + * increment "down_count" the difference between than + * and "up_count" will no longer be the number of + * children that are currently up. This has serious + * implications e.g. for quorum enforcement, so we + * don't increment these values unless the event + * represents an actual state transition between "up" + * (value = 1) and anything else. + */ + if (priv->child_up[idx] == 1) { + priv->event_generation++; + } + + /* + * If this is an _actual_ CHILD_DOWN event, we + * want to set the child_latency to < 0 to indicate + * the child is really disconnected. + */ + if (child_latency_msec < 0) { + priv->child_latency[idx] = child_latency_msec; + priv->halo_child_up[idx] = 0; + } + priv->child_up[idx] = 0; + + up_children = __afr_get_up_children_count(priv); + /* + * Handle the edge case where we need to find the + * next best child (to mark up) as marking this child + * down would cause us to fall below halo_min_replicas. + * We will also force the SHD to heal this child _now_ + * as we want it to be up to date if we are going to + * begin using it synchronously. 
+ */ + if (priv->halo_enabled && up_children < priv->halo_min_replicas) { + best_down_child = find_best_down_child(this); + if (best_down_child >= 0) { + gf_msg_debug(this->name, 0, + "Swapping out child %d for " + "child %d to satisfy halo_min_replicas (%d).", + idx, best_down_child, priv->halo_min_replicas); + priv->child_up[best_down_child] = 1; + *call_psh = 1; + *up_child = best_down_child; } - return 0; + } + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 0) + down_children++; + if (down_children == priv->child_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SUBVOLS_DOWN, + "All subvolumes are down. Going " + "offline until at least one of them " + "comes back up."); + gf_event(EVENT_AFR_SUBVOLS_DOWN, "client-pid=%d; subvol=%s", + this->ctx->cmd_args.client_pid, this->name); + } else { + *event = GF_EVENT_SOME_DESCENDENT_DOWN; + } + priv->last_event[idx] = *event; +} + +void +afr_ta_lock_release_synctask(xlator_t *this) +{ + call_frame_t *ta_frame = NULL; + int ret = 0; + + ta_frame = afr_ta_frame_create(this); + if (!ta_frame) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to create ta_frame"); + return; + } + + ret = synctask_new(this->ctx->env, afr_release_notify_lock_for_ta, + afr_ta_lock_release_done, ta_frame, this); + if (ret) { + STACK_DESTROY(ta_frame->root); + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to release " + "AFR_TA_DOM_NOTIFY lock."); + } +} + +static void +afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall) +{ + struct gf_upcall_inodelk_contention *lc = NULL; + unsigned int inmem_count = 0; + unsigned int onwire_count = 0; + afr_private_t *priv = this->private; + + lc = upcall->data; + + if (strcmp(lc->domain, AFR_TA_DOM_NOTIFY) != 0) + return; + + if (priv->shd.iamshd) { + /* shd should ignore AFR_TA_DOM_NOTIFY release requests. 
*/ + return; + } + LOCK(&priv->lock); + { + if (priv->release_ta_notify_dom_lock == _gf_true) { + /* Ignore multiple release requests from shds.*/ + UNLOCK(&priv->lock); + return; + } + priv->release_ta_notify_dom_lock = _gf_true; + inmem_count = priv->ta_in_mem_txn_count; + onwire_count = priv->ta_on_wire_txn_count; + } + UNLOCK(&priv->lock); + if (inmem_count || onwire_count) + /* lock release will happen in txn code path after + * in-memory or on-wire txns are over.*/ + return; + + afr_ta_lock_release_synctask(this); +} + +static void +afr_handle_upcall_event(xlator_t *this, struct gf_upcall *upcall) +{ + struct gf_upcall_cache_invalidation *up_ci = NULL; + afr_private_t *priv = this->private; + inode_t *inode = NULL; + inode_table_t *itable = NULL; + int i = 0; + + switch (upcall->event_type) { + case GF_UPCALL_INODELK_CONTENTION: + afr_handle_inodelk_contention(this, upcall); + break; + case GF_UPCALL_CACHE_INVALIDATION: + up_ci = (struct gf_upcall_cache_invalidation *)upcall->data; + + /* Since md-cache will be aggressively filtering + * lookups, the stale read issue will be more + * pronounced. 
Hence when a pending xattr is set notify + * all the md-cache clients to invalidate the existing + * stat cache and send the lookup next time */ + if (!up_ci->dict) + break; + for (i = 0; i < priv->child_count; i++) { + if (!dict_get(up_ci->dict, priv->pending_key[i])) + continue; + up_ci->flags |= UP_INVAL_ATTR; + itable = ((xlator_t *)this->graph->top)->itable; + /*Internal processes may not have itable for + *top xlator*/ + if (itable) + inode = inode_find(itable, upcall->gfid); + if (inode) + afr_inode_need_refresh_set(inode, this); + break; + } + break; + default: + break; + } } int32_t -afr_statfs_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct statvfs *statvfs) +afr_notify(xlator_t *this, int32_t event, void *data, void *data2) { - afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t *child_xlator = NULL; + int i = -1; + int propagate = 0; + int had_heard_from_all = 0; + int have_heard_from_all = 0; + int idx = -1; + int ret = -1; + int call_psh = 0; + int up_child = -1; + dict_t *input = NULL; + dict_t *output = NULL; + gf_boolean_t had_quorum = _gf_false; + gf_boolean_t has_quorum = _gf_false; + int64_t halo_max_latency_msec = 0; + int64_t child_latency_msec = -1; + + child_xlator = (xlator_t *)data; + + priv = this->private; + + if (!priv) + return 0; - int call_count = 0; + /* + * We need to reset this in case children come up in "staggered" + * fashion, so that we discover a late-arriving local subvolume. Note + * that we could end up issuing N lookups to the first subvolume, and + * O(N^2) overall, but N is small for AFR so it shouldn't be an issue. + */ + priv->did_discovery = _gf_false; + + /* parent xlators don't need to know about every child_up, child_down + * because of afr ha. If all subvolumes go down, child_down has + * to be triggered. In that state when 1 subvolume comes up child_up + * needs to be triggered. 
dht optimizes revalidate lookup by sending + * it only to one of its subvolumes. When child up/down happens + * for afr's subvolumes dht should be notified by child_modified. The + * subsequent revalidate lookup happens on all the dht's subvolumes + * which triggers afr self-heals if any. + */ + idx = afr_find_child_index(this, child_xlator); + if (idx < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP, + "Received child_up from invalid subvolume"); + goto out; + } + + had_quorum = priv->quorum_count && + afr_has_quorum(priv->child_up, this, NULL); + if (event == GF_EVENT_CHILD_PING) { + child_latency_msec = (int64_t)(uintptr_t)data2; + if (priv->halo_enabled) { + halo_max_latency_msec = afr_get_halo_latency(this); + + /* Calculates the child latency and sets event + */ + LOCK(&priv->lock); + { + __afr_handle_ping_event(this, child_xlator, idx, + halo_max_latency_msec, &event, + child_latency_msec); + } + UNLOCK(&priv->lock); + } else { + LOCK(&priv->lock); + { + priv->child_latency[idx] = child_latency_msec; + } + UNLOCK(&priv->lock); + } + } - LOCK (&frame->lock); + if (event == GF_EVENT_CHILD_PING) { + /* This is the only xlator that handles PING, no reason to + * propagate. 
+ */ + goto out; + } + + if (event == GF_EVENT_TRANSLATOR_OP) { + LOCK(&priv->lock); { - local = frame->local; - - if (op_ret == 0) { - local->op_ret = op_ret; - - if (local->cont.statfs.buf_set) { - if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) - local->cont.statfs.buf = *statvfs; - } else { - local->cont.statfs.buf = *statvfs; - local->cont.statfs.buf_set = 1; - } + had_heard_from_all = __get_heard_from_all_status(this); + } + UNLOCK(&priv->lock); + + if (!had_heard_from_all) { + ret = -1; + } else { + input = data; + output = data2; + ret = afr_xl_op(this, input, output); + } + goto out; + } + + if (event == GF_EVENT_UPCALL) { + afr_handle_upcall_event(this, data); + } + + LOCK(&priv->lock); + { + had_heard_from_all = __get_heard_from_all_status(this); + switch (event) { + case GF_EVENT_PARENT_UP: + __afr_launch_notify_timer(this, priv); + propagate = 1; + break; + case GF_EVENT_CHILD_UP: + if (priv->thin_arbiter_count && + (idx == AFR_CHILD_THIN_ARBITER)) { + priv->ta_child_up = 1; + priv->ta_event_gen++; + break; } + __afr_handle_child_up_event(this, child_xlator, idx, + child_latency_msec, &event, + &call_psh, &up_child); + __afr_lock_heal_synctask(this, priv, idx); + break; - if (op_ret == -1) - local->op_errno = op_errno; + case GF_EVENT_CHILD_DOWN: + if (priv->thin_arbiter_count && + (idx == AFR_CHILD_THIN_ARBITER)) { + priv->ta_child_up = 0; + priv->ta_event_gen++; + afr_ta_locked_priv_invalidate(priv); + break; + } + __afr_handle_child_down_event(this, child_xlator, idx, + child_latency_msec, &event, + &call_psh, &up_child); + __afr_mark_pending_lk_heal(this, priv, idx); + break; - } - UNLOCK (&frame->lock); + case GF_EVENT_CHILD_CONNECTING: + priv->last_event[idx] = event; - call_count = afr_frame_return (frame); + break; - if (call_count == 0) - AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, - &local->cont.statfs.buf); + case GF_EVENT_SOME_DESCENDENT_DOWN: + priv->last_event[idx] = event; + break; + default: + propagate = 
1; + break; + } + have_heard_from_all = __get_heard_from_all_status(this); + if (!had_heard_from_all && have_heard_from_all) { + if (priv->timer) { + gf_timer_call_cancel(this->ctx, priv->timer); + priv->timer = NULL; + } + /* This is the first event which completes aggregation + of events from all subvolumes. If at least one subvol + had come up, propagate CHILD_UP, but only this time + */ + event = GF_EVENT_CHILD_DOWN; + for (i = 0; i < priv->child_count; i++) { + if (priv->last_event[i] == GF_EVENT_CHILD_UP) { + event = GF_EVENT_CHILD_UP; + break; + } - return 0; + if (priv->last_event[i] == GF_EVENT_CHILD_CONNECTING) { + event = GF_EVENT_CHILD_CONNECTING; + /* continue to check other events for CHILD_UP */ + } + } + } + } + UNLOCK(&priv->lock); + + if (priv->quorum_count) { + has_quorum = afr_has_quorum(priv->child_up, this, NULL); + if (!had_quorum && has_quorum) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_QUORUM_MET, + "Client-quorum is met"); + gf_event(EVENT_AFR_QUORUM_MET, "client-pid=%d; subvol=%s", + this->ctx->cmd_args.client_pid, this->name); + } + if (had_quorum && !has_quorum) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL, + "Client-quorum is not met"); + gf_event(EVENT_AFR_QUORUM_FAIL, "client-pid=%d; subvol=%s", + this->ctx->cmd_args.client_pid, this->name); + } + } + + /* if all subvols have reported status, no need to hide anything + or wait for anything else. Just propagate blindly */ + if (have_heard_from_all) + propagate = 1; + + ret = 0; + if (propagate) + ret = default_notify(this, event, data); + + if ((!had_heard_from_all) || call_psh) { + /* Launch self-heal on all local subvolumes if: + * a) We have_heard_from_all for the first time + * b) Already heard from everyone, but we now got a child-up + * event. 
+ */ + if (have_heard_from_all) { + afr_selfheal_childup(this, priv); + } + } +out: + return ret; } +int +afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno) +{ + int __ret = -1; + local->op_ret = -1; + local->op_errno = EUCLEAN; + + __ret = syncbarrier_init(&local->barrier); + if (__ret) { + if (op_errno) + *op_errno = __ret; + goto out; + } + + local->child_up = GF_MALLOC(priv->child_count * sizeof(*local->child_up), + gf_afr_mt_char); + if (!local->child_up) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + memcpy(local->child_up, priv->child_up, + sizeof(*local->child_up) * priv->child_count); + local->call_count = AFR_COUNT(local->child_up, priv->child_count); + if (local->call_count == 0) { + gf_msg(THIS->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOLS_DOWN, + "no subvolumes up"); + if (op_errno) + *op_errno = ENOTCONN; + goto out; + } + + local->event_generation = priv->event_generation; + + local->read_attempted = GF_CALLOC(priv->child_count, sizeof(char), + gf_afr_mt_char); + if (!local->read_attempted) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->readable = GF_CALLOC(priv->child_count, sizeof(char), + gf_afr_mt_char); + if (!local->readable) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->readable2 = GF_CALLOC(priv->child_count, sizeof(char), + gf_afr_mt_char); + if (!local->readable2) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->read_subvol = -1; + + local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), + gf_afr_mt_reply_t); + if (!local->replies) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->need_full_crawl = _gf_false; + if (priv->thin_arbiter_count) { + local->ta_child_up = priv->ta_child_up; + local->ta_failed_subvol = AFR_CHILD_UNKNOWN; + local->read_txn_query_child = AFR_CHILD_UNKNOWN; + local->ta_event_gen = priv->ta_event_gen; + local->fop_state = TA_SUCCESS; + } + local->is_new_entry = _gf_false; + + 
INIT_LIST_HEAD(&local->healer); + return 0; +out: + return -1; +} -int32_t -afr_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc) +int +afr_internal_lock_init(afr_internal_lock_t *lk, size_t child_count) { - afr_private_t * priv = NULL; - int child_count = 0; - afr_local_t * local = NULL; - int i = 0; + int ret = -ENOMEM; - int ret = -1; - int call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; + lk->lower_locked_nodes = GF_CALLOC(sizeof(*lk->lower_locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->lower_locked_nodes) + goto out; - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); + lk->lock_op_ret = -1; + lk->lock_op_errno = EUCLEAN; - priv = this->private; - child_count = priv->child_count; + ret = 0; +out: + return ret; +} - ALLOC_OR_GOTO (local, afr_local_t, out); +void +afr_matrix_cleanup(int32_t **matrix, unsigned int m) +{ + int i = 0; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + if (!matrix) + goto out; + for (i = 0; i < m; i++) { + GF_FREE(matrix[i]); + } - frame->local = local; - call_count = local->call_count; + GF_FREE(matrix); +out: + return; +} - for (i = 0; i < child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_statfs_cbk, - priv->children[i], - priv->children[i]->fops->statfs, - loc); - if (!--call_count) - break; - } - } +int32_t ** +afr_matrix_create(unsigned int m, unsigned int n) +{ + int32_t **matrix = NULL; + int i = 0; + + matrix = GF_CALLOC(sizeof(*matrix), m, gf_afr_mt_int32_t); + if (!matrix) + goto out; + + for (i = 0; i < m; i++) { + matrix[i] = GF_CALLOC(sizeof(*matrix[i]), n, gf_afr_mt_int32_t); + if (!matrix[i]) + goto out; + } + return matrix; +out: + afr_matrix_cleanup(matrix, m); + return NULL; +} - op_ret = 0; +int +afr_transaction_local_init(afr_local_t *local, xlator_t *this) +{ + int ret = -ENOMEM; + afr_private_t *priv = NULL; + + priv = this->private; + 
INIT_LIST_HEAD(&local->transaction.wait_list); + INIT_LIST_HEAD(&local->transaction.owner_list); + INIT_LIST_HEAD(&local->ta_waitq); + INIT_LIST_HEAD(&local->ta_onwireq); + ret = afr_internal_lock_init(&local->internal_lock, priv->child_count); + if (ret < 0) + goto out; + + ret = -ENOMEM; + local->pre_op_compat = priv->pre_op_compat; + + local->transaction.pre_op = GF_CALLOC(sizeof(*local->transaction.pre_op), + priv->child_count, gf_afr_mt_char); + if (!local->transaction.pre_op) + goto out; + + local->transaction.changelog_xdata = GF_CALLOC( + sizeof(*local->transaction.changelog_xdata), priv->child_count, + gf_afr_mt_dict_t); + if (!local->transaction.changelog_xdata) + goto out; + + if (priv->arbiter_count == 1) { + local->transaction.pre_op_sources = GF_CALLOC( + sizeof(*local->transaction.pre_op_sources), priv->child_count, + gf_afr_mt_char); + if (!local->transaction.pre_op_sources) + goto out; + } + + local->transaction.failed_subvols = GF_CALLOC( + sizeof(*local->transaction.failed_subvols), priv->child_count, + gf_afr_mt_char); + if (!local->transaction.failed_subvols) + goto out; + + local->pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!local->pending) + goto out; + + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (statfs, frame, op_ret, op_errno, NULL); - } - return 0; + return ret; } +void +afr_set_low_priority(call_frame_t *frame) +{ + frame->root->pid = LOW_PRIO_PROC_PID; +} -int32_t -afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) +void +afr_priv_destroy(afr_private_t *priv) { - afr_local_t * local = NULL; + int i = 0; + int child_count = -1; - int call_count = -1; + if (!priv) + goto out; - local = frame->local; - call_count = afr_frame_return (frame); + GF_FREE(priv->sh_domain); + GF_FREE(priv->last_event); - if (call_count == 0) - AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - lock); + child_count = 
priv->child_count; + if (priv->thin_arbiter_count) { + child_count++; + } + if (priv->pending_key) { + for (i = 0; i < child_count; i++) + GF_FREE(priv->pending_key[i]); + } + + GF_FREE(priv->pending_reads); + GF_FREE(priv->local); + GF_FREE(priv->pending_key); + GF_FREE(priv->children); + GF_FREE(priv->anon_inode); + GF_FREE(priv->child_up); + GF_FREE(priv->halo_child_up); + GF_FREE(priv->child_latency); + LOCK_DESTROY(&priv->lock); + + GF_FREE(priv); +out: + return; +} - return 0; +int ** +afr_mark_pending_changelog(afr_private_t *priv, unsigned char *pending, + dict_t *xattr, ia_type_t iat) +{ + int i = 0; + int **changelog = NULL; + int idx = -1; + int m_idx = 0; + int d_idx = 0; + int ret = 0; + + m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION); + d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION); + + idx = afr_index_from_ia_type(iat); + + changelog = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!changelog) + goto out; + + for (i = 0; i < priv->child_count; i++) { + if (!pending[i]) + continue; + + changelog[i][m_idx] = hton32(1); + if (idx != -1) + changelog[i][idx] = hton32(1); + /* If the newentry marking is on a newly created directory, + * then mark it with the full-heal indicator. 
+ */ + if ((IA_ISDIR(iat)) && (priv->esh_granular)) + changelog[i][d_idx] = hton32(1); + } + ret = afr_set_pending_dict(priv, xattr, changelog); + if (ret < 0) { + afr_matrix_cleanup(changelog, priv->child_count); + return NULL; + } +out: + return changelog; } +static dict_t * +afr_set_heal_info(char *status) +{ + dict_t *dict = NULL; + int ret = -1; + + dict = dict_new(); + if (!dict) { + ret = -ENOMEM; + goto out; + } + + ret = dict_set_dynstr_sizen(dict, "heal-info", status); + if (ret) + gf_msg("", GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "Failed to set heal-info key to " + "%s", + status); +out: + /* Any error other than EINVAL, dict_set_dynstr frees status */ + if (ret == -ENOMEM || ret == -EINVAL) { + GF_FREE(status); + } + + if (ret && dict) { + dict_unref(dict); + dict = NULL; + } + return dict; +} -int32_t -afr_lk_unlock (call_frame_t *frame, xlator_t *this) +static gf_boolean_t +afr_is_dirty_count_non_unary_for_txn(xlator_t *this, struct afr_reply *replies, + afr_transaction_type type) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + afr_private_t *priv = this->private; + int *dirty = alloca0(priv->child_count * sizeof(int)); + int i = 0; - int i; - int call_count = 0; + afr_selfheal_extract_xattr(this, replies, type, dirty, NULL); + for (i = 0; i < priv->child_count; i++) { + if (dirty[i] > 1) + return _gf_true; + } - local = frame->local; - priv = this->private; + return _gf_false; +} - call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes, - priv->child_count); +static gf_boolean_t +afr_is_dirty_count_non_unary(xlator_t *this, struct afr_reply *replies, + ia_type_t ia_type) +{ + gf_boolean_t data_chk = _gf_false; + gf_boolean_t mdata_chk = _gf_false; + gf_boolean_t entry_chk = _gf_false; + + switch (ia_type) { + case IA_IFDIR: + mdata_chk = _gf_true; + entry_chk = _gf_true; + break; + case IA_IFREG: + mdata_chk = _gf_true; + data_chk = _gf_true; + break; + default: + /*IA_IFBLK, IA_IFCHR, IA_IFLNK, IA_IFIFO, 
IA_IFSOCK*/ + mdata_chk = _gf_true; + break; + } + + if (data_chk && afr_is_dirty_count_non_unary_for_txn( + this, replies, AFR_DATA_TRANSACTION)) { + return _gf_true; + } else if (mdata_chk && afr_is_dirty_count_non_unary_for_txn( + this, replies, AFR_METADATA_TRANSACTION)) { + return _gf_true; + } else if (entry_chk && afr_is_dirty_count_non_unary_for_txn( + this, replies, AFR_ENTRY_TRANSACTION)) { + return _gf_true; + } + + return _gf_false; +} - if (call_count == 0) { - AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - &local->cont.lk.ret_flock); - return 0; +static int +afr_update_heal_status(xlator_t *this, struct afr_reply *replies, + ia_type_t ia_type, gf_boolean_t *esh, gf_boolean_t *dsh, + gf_boolean_t *msh, unsigned char pending) +{ + int ret = -1; + GF_UNUSED int ret1 = 0; + int i = 0; + int io_domain_lk_count = 0; + int shd_domain_lk_count = 0; + afr_private_t *priv = NULL; + char *key1 = NULL; + char *key2 = NULL; + + priv = this->private; + key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(this->name)); + key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(priv->sh_domain)); + sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name); + sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain); + + for (i = 0; i < priv->child_count; i++) { + if ((replies[i].valid != 1) || (replies[i].op_ret != 0)) + continue; + if (!io_domain_lk_count) { + ret1 = dict_get_int32(replies[i].xdata, key1, &io_domain_lk_count); + } + if (!shd_domain_lk_count) { + ret1 = dict_get_int32(replies[i].xdata, key2, &shd_domain_lk_count); } + } - local->call_count = call_count; + if (!pending) { + if ((afr_is_dirty_count_non_unary(this, replies, ia_type)) || + (!io_domain_lk_count)) { + /* Needs heal. */ + ret = 0; + } else { + /* No heal needed. */ + *dsh = *esh = *msh = 0; + } + } else { + if (shd_domain_lk_count) { + ret = -EAGAIN; /*For 'possibly-healing'. */ + } else { + ret = 0; /*needs heal. 
Just set a non -ve value so that it is + assumed as the source index.*/ + } + } + return ret; +} - local->cont.lk.user_flock.l_type = F_UNLCK; +/*return EIO, EAGAIN or pending*/ +int +afr_lockless_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, + inode_t **inode, gf_boolean_t *entry_selfheal, + gf_boolean_t *data_selfheal, + gf_boolean_t *metadata_selfheal, unsigned char *pending) +{ + int ret = -1; + int i = 0; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + gf_boolean_t dsh = _gf_false; + gf_boolean_t msh = _gf_false; + gf_boolean_t esh = _gf_false; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *valid_on = NULL; + uint64_t *witness = NULL; + + priv = this->private; + replies = alloca0(sizeof(*replies) * priv->child_count); + sources = alloca0(sizeof(*sources) * priv->child_count); + sinks = alloca0(sizeof(*sinks) * priv->child_count); + witness = alloca0(sizeof(*witness) * priv->child_count); + valid_on = alloca0(sizeof(*valid_on) * priv->child_count); + + ret = afr_selfheal_unlocked_inspect(frame, this, gfid, inode, &dsh, &msh, + &esh, replies); + if (ret) + goto out; + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid && replies[i].op_ret == 0) { + valid_on[i] = 1; + } + } + if (msh) { + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_METADATA_TRANSACTION, valid_on, + sources, sinks, witness, pending); + if (*pending & PFLAG_SBRAIN) + ret = -EIO; + if (ret) + goto out; + } + if (dsh) { + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_DATA_TRANSACTION, valid_on, + sources, sinks, witness, pending); + if (*pending & PFLAG_SBRAIN) + ret = -EIO; + if (ret) + goto out; + } + if (esh) { + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_ENTRY_TRANSACTION, valid_on, + sources, sinks, witness, pending); + if (*pending & PFLAG_SBRAIN) + ret = -EIO; + if (ret) + goto out; + } - for (i = 0; i < priv->child_count; i++) { - if 
(local->cont.lk.locked_nodes[i]) { - STACK_WIND (frame, afr_lk_unlock_cbk, - priv->children[i], - priv->children[i]->fops->lk, - local->fd, F_SETLK, - &local->cont.lk.user_flock); - - if (!--call_count) - break; - } + ret = afr_update_heal_status(this, replies, (*inode)->ia_type, &esh, &dsh, + &msh, *pending); +out: + *data_selfheal = dsh; + *entry_selfheal = esh; + *metadata_selfheal = msh; + if (replies) + afr_replies_wipe(replies, priv->child_count); + return ret; +} + +int +afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + gf_boolean_t data_selfheal = _gf_false; + gf_boolean_t metadata_selfheal = _gf_false; + gf_boolean_t entry_selfheal = _gf_false; + unsigned char pending = 0; + dict_t *dict = NULL; + int ret = -1; + int op_errno = ENOMEM; + inode_t *inode = NULL; + char *substr = NULL; + char *status = NULL; + call_frame_t *heal_frame = NULL; + afr_local_t *heal_local = NULL; + + /*Use frame with lk-owner set*/ + heal_frame = afr_frame_create(frame->this, &op_errno); + if (!heal_frame) { + ret = -1; + goto out; + } + heal_local = heal_frame->local; + heal_frame->local = frame->local; + + ret = afr_lockless_inspect(heal_frame, this, loc->gfid, &inode, + &entry_selfheal, &data_selfheal, + &metadata_selfheal, &pending); + + if (ret == -ENOMEM) { + ret = -1; + goto out; + } + + if (pending & PFLAG_PENDING) { + gf_asprintf(&substr, "-pending"); + if (!substr) + goto out; + } + + if (ret == -EIO) { + ret = gf_asprintf(&status, "split-brain%s", substr ? substr : ""); + if (ret < 0) { + goto out; + } + dict = afr_set_heal_info(status); + if (!dict) { + ret = -1; + goto out; + } + } else if (ret == -EAGAIN) { + ret = gf_asprintf(&status, "possibly-healing%s", substr ? 
substr : ""); + if (ret < 0) { + goto out; + } + dict = afr_set_heal_info(status); + if (!dict) { + ret = -1; + goto out; + } + } else if (ret >= 0) { + /* value of ret = source index + * so ret >= 0 and at least one of the 3 booleans set to + * true means a source is identified; heal is required. + */ + if (!data_selfheal && !entry_selfheal && !metadata_selfheal) { + status = gf_strdup("no-heal"); + if (!status) { + ret = -1; + goto out; + } + dict = afr_set_heal_info(status); + if (!dict) { + ret = -1; + goto out; + } + } else { + ret = gf_asprintf(&status, "heal%s", substr ? substr : ""); + if (ret < 0) { + goto out; + } + dict = afr_set_heal_info(status); + if (!dict) { + ret = -1; + goto out; + } + } + } else if (ret < 0) { + /* Apart from above checked -ve ret values, there are + * other possible ret values like ENOTCONN + * (returned when number of valid replies received are + * less than 2) + * in which case heal is required when one of the + * selfheal booleans is set. + */ + if (data_selfheal || entry_selfheal || metadata_selfheal) { + ret = gf_asprintf(&status, "heal%s", substr ? 
substr : ""); + if (ret < 0) { + goto out; + } + dict = afr_set_heal_info(status); + if (!dict) { + ret = -1; + goto out; + } } + } - return 0; + ret = 0; + op_errno = 0; + +out: + if (heal_frame) { + heal_frame->local = heal_local; + AFR_STACK_DESTROY(heal_frame); + } + AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL); + if (dict) + dict_unref(dict); + if (inode) + inode_unref(inode); + GF_FREE(substr); + return ret; } +int +_afr_is_split_brain(call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, afr_transaction_type type, + gf_boolean_t *spb) +{ + afr_private_t *priv = NULL; + uint64_t *witness = NULL; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + int sources_count = 0; + int ret = 0; + + priv = this->private; + + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + witness = alloca0(priv->child_count * sizeof(*witness)); + + ret = afr_selfheal_find_direction(frame, this, replies, type, + priv->child_up, sources, sinks, witness, + NULL); + if (ret) + return ret; + + sources_count = AFR_COUNT(sources, priv->child_count); + if (!sources_count) + *spb = _gf_true; -int32_t -afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + return ret; +} + +int +afr_is_split_brain(call_frame_t *frame, xlator_t *this, inode_t *inode, + uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; -/* int ret = 0; */ + int ret = -1; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; - int child_index = -1; + priv = this->private; - local = frame->local; - priv = this->private; + replies = alloca0(sizeof(*replies) * priv->child_count); - child_index = (long) cookie; + ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies); + if (ret) + goto out; - if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) { - local->op_ret = -1; - local->op_errno = op_errno; + if 
(!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) { + ret = -EAGAIN; + goto out; + } - afr_lk_unlock (frame, this); - return 0; + ret = _afr_is_split_brain(frame, this, replies, AFR_DATA_TRANSACTION, + d_spb); + if (ret) + goto out; + + ret = _afr_is_split_brain(frame, this, replies, AFR_METADATA_TRANSACTION, + m_spb); +out: + if (replies) { + afr_replies_wipe(replies, priv->child_count); + replies = NULL; + } + return ret; +} + +int +afr_get_split_brain_status_cbk(int ret, call_frame_t *frame, void *opaque) +{ + GF_FREE(opaque); + return 0; +} + +int +afr_get_split_brain_status(void *opaque) +{ + gf_boolean_t d_spb = _gf_false; + gf_boolean_t m_spb = _gf_false; + int ret = -1; + int op_errno = 0; + int i = 0; + char *choices = NULL; + char *status = NULL; + dict_t *dict = NULL; + inode_t *inode = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + loc_t *loc = NULL; + afr_spb_status_t *data = NULL; + + data = opaque; + frame = data->frame; + this = frame->this; + loc = data->loc; + priv = this->private; + children = priv->children; + + inode = afr_inode_find(this, loc->gfid); + if (!inode) + goto out; + + dict = dict_new(); + if (!dict) { + op_errno = ENOMEM; + ret = -1; + goto out; + } + + /* Calculation for string length : + * (child_count X length of child-name) + SLEN(" Choices :") + * child-name consists of : + * a) 251 = max characters for volname according to GD_VOLUME_NAME_MAX + * b) strlen("-client-00,") assuming 16 replicas + */ + choices = alloca0(priv->child_count * (256 + SLEN("-client-00,")) + + SLEN(" Choices:")); + + ret = afr_is_split_brain(frame, this, inode, loc->gfid, &d_spb, &m_spb); + if (ret) { + op_errno = -ret; + if (ret == -EAGAIN) { + ret = dict_set_sizen_str_sizen(dict, GF_AFR_SBRAIN_STATUS, + SBRAIN_HEAL_NO_GO_MSG); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, + AFR_MSG_DICT_SET_FAILED, + "Failed to set GF_AFR_SBRAIN_STATUS in dict"); 
+ } } + ret = -1; + goto out; + } - if (op_ret == 0) { - local->op_ret = 0; - local->op_errno = 0; - local->cont.lk.locked_nodes[child_index] = 1; - local->cont.lk.ret_flock = *lock; + if (d_spb || m_spb) { + sprintf(choices, " Choices:"); + for (i = 0; i < priv->child_count; i++) { + strcat(choices, children[i]->name); + strcat(choices, ","); } + choices[strlen(choices) - 1] = '\0'; - child_index++; + ret = gf_asprintf(&status, + "data-split-brain:%s " + "metadata-split-brain:%s%s", + (d_spb) ? "yes" : "no", (m_spb) ? "yes" : "no", + choices); - if (child_index < priv->child_count) { - STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->lk, - local->fd, local->cont.lk.cmd, - &local->cont.lk.user_flock); - } else if (local->op_ret == -1) { - /* all nodes have gone down */ + if (-1 == ret) { + op_errno = ENOMEM; + goto out; + } + ret = dict_set_dynstr_sizen(dict, GF_AFR_SBRAIN_STATUS, status); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + } else { + ret = dict_set_sizen_str_sizen(dict, GF_AFR_SBRAIN_STATUS, + SFILE_NOT_UNDER_DATA); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + } - AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN, - &local->cont.lk.ret_flock); - } else { - /* locking has succeeded on all nodes that are up */ + ret = 0; +out: + AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL); + if (dict) + dict_unref(dict); + if (inode) + inode_unref(inode); + return ret; +} - /* temporarily - ret = afr_mark_locked_nodes (this, local->fd, - local->cont.lk.locked_nodes); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not save locked nodes info in fdctx"); +int32_t +afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + int ret = 0; + int op_errno = 0; + dict_t *dict = NULL; + afr_local_t *local = NULL; + afr_local_t *heal_local = NULL; + call_frame_t *heal_frame = NULL; + + local = frame->local; + dict = dict_new(); + 
if (!dict) { + op_errno = ENOMEM; + ret = -1; + goto out; + } + + heal_frame = afr_frame_create(this, &op_errno); + if (!heal_frame) { + ret = -1; + goto out; + } + heal_local = heal_frame->local; + heal_frame->local = frame->local; + /*Initiate heal with heal_frame with lk-owner set so that inodelk/entrylk + * work correctly*/ + ret = afr_selfheal_do(heal_frame, this, loc->gfid); + + if (ret == 1 || ret == 2) { + ret = dict_set_sizen_str_sizen(dict, "sh-fail-msg", + SFILE_NOT_IN_SPLIT_BRAIN); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "Failed to set sh-fail-msg in dict"); + ret = 0; + goto out; + } else { + if (local->xdata_rsp) { + /* 'sh-fail-msg' has been set in the dict during self-heal.*/ + dict_copy(local->xdata_rsp, dict); + ret = 0; + } else if (ret < 0) { + op_errno = -ret; + ret = -1; + } + } - ret = afr_save_locked_fd (this, local->fd); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not save locked fd"); +out: + if (heal_frame) { + heal_frame->local = heal_local; + AFR_STACK_DESTROY(heal_frame); + } + if (local->op == GF_FOP_GETXATTR) + AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL); + else if (local->op == GF_FOP_SETXATTR) + AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL); + if (dict) + dict_unref(dict); + return ret; +} - */ - AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - &local->cont.lk.ret_flock); - } +int +afr_get_child_index_from_name(xlator_t *this, char *name) +{ + afr_private_t *priv = this->private; + int index = -1; + + for (index = 0; index < priv->child_count; index++) { + if (!strcmp(priv->children[index]->name, name)) + goto out; + } + index = -1; +out: + return index; +} - return 0; +void +afr_priv_need_heal_set(afr_private_t *priv, gf_boolean_t need_heal) +{ + LOCK(&priv->lock); + { + priv->need_heal = need_heal; + } + UNLOCK(&priv->lock); } +void +afr_set_need_heal(xlator_t *this, afr_local_t *local) +{ + int i = 0; + afr_private_t *priv = 
this->private; + gf_boolean_t need_heal = _gf_false; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].need_heal) { + need_heal = _gf_true; + break; + } + } + afr_priv_need_heal_set(priv, need_heal); + return; +} + +gf_boolean_t +afr_get_need_heal(xlator_t *this) +{ + afr_private_t *priv = this->private; + gf_boolean_t need_heal = _gf_true; + + LOCK(&priv->lock); + { + need_heal = priv->need_heal; + } + UNLOCK(&priv->lock); + return need_heal; +} int -afr_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t cmd, - struct gf_flock *flock) +afr_get_msg_id(char *op_type) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; + if (!strcmp(op_type, GF_AFR_REPLACE_BRICK)) + return AFR_MSG_REPLACE_BRICK_STATUS; + else if (!strcmp(op_type, GF_AFR_ADD_BRICK)) + return AFR_MSG_ADD_BRICK_STATUS; + return -1; +} - int i = 0; +int +afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *heal_frame, + void *opaque) +{ + call_frame_t *txn_frame = NULL; + afr_local_t *local = NULL; + afr_local_t *heal_local = NULL; + xlator_t *this = NULL; - int32_t op_ret = -1; - int32_t op_errno = 0; + heal_local = heal_frame->local; + txn_frame = heal_local->heal_frame; + local = txn_frame->local; + this = txn_frame->this; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + /* Refresh the inode agan and proceed with the transaction.*/ + afr_inode_refresh(txn_frame, this, local->inode, NULL, local->refreshfn); - priv = this->private; + AFR_STACK_DESTROY(heal_frame); - ALLOC_OR_GOTO (local, afr_local_t, out); - AFR_LOCAL_INIT (local, priv); + return 0; +} - frame->local = local; +int +afr_fav_child_reset_sink_xattrs(void *opaque) +{ + call_frame_t *heal_frame = NULL; + call_frame_t *txn_frame = NULL; + xlator_t *this = NULL; + gf_boolean_t d_spb = _gf_false; + gf_boolean_t m_spb = _gf_false; + afr_local_t *heal_local = NULL; + afr_local_t *txn_local = NULL; + afr_private_t 
*priv = NULL; + inode_t *inode = NULL; + unsigned char *locked_on = NULL; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *undid_pending = NULL; + struct afr_reply *locked_replies = NULL; + int ret = 0; + + heal_frame = (call_frame_t *)opaque; + heal_local = heal_frame->local; + txn_frame = heal_local->heal_frame; + txn_local = txn_frame->local; + this = txn_frame->this; + inode = txn_local->inode; + priv = this->private; + locked_on = alloca0(priv->child_count); + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + healed_sinks = alloca0(priv->child_count); + undid_pending = alloca0(priv->child_count); + locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count); + + ret = _afr_is_split_brain(txn_frame, this, txn_local->replies, + AFR_DATA_TRANSACTION, &d_spb); + + ret = _afr_is_split_brain(txn_frame, this, txn_local->replies, + AFR_METADATA_TRANSACTION, &m_spb); + + /* Take appropriate locks and reset sink xattrs. 
*/ + if (d_spb) { + ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, 0, 0, + locked_on); + { + if (ret < priv->child_count) + goto data_unlock; + ret = __afr_selfheal_data_prepare( + heal_frame, this, inode, locked_on, sources, sinks, + healed_sinks, undid_pending, locked_replies, NULL); + } + data_unlock: + afr_selfheal_uninodelk(heal_frame, this, inode, this->name, 0, 0, + locked_on); + } + + if (m_spb) { + memset(locked_on, 0, sizeof(*locked_on) * priv->child_count); + memset(undid_pending, 0, sizeof(*undid_pending) * priv->child_count); + ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, + LLONG_MAX - 1, 0, locked_on); + { + if (ret < priv->child_count) + goto mdata_unlock; + ret = __afr_selfheal_metadata_prepare( + heal_frame, this, inode, locked_on, sources, sinks, + healed_sinks, undid_pending, locked_replies, NULL); + } + mdata_unlock: + afr_selfheal_uninodelk(heal_frame, this, inode, this->name, + LLONG_MAX - 1, 0, locked_on); + } - local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count, - sizeof (*local->cont.lk.locked_nodes), - gf_afr_mt_char); + return ret; +} - if (!local->cont.lk.locked_nodes) { - gf_log (this->name, GF_LOG_ERROR, "Out of memory"); - op_errno = ENOMEM; +/* + * Concatenates the xattrs in local->replies separated by a delimiter. 
+ */ +int +afr_serialize_xattrs_with_delimiter(call_frame_t *frame, xlator_t *this, + char *buf, const char *default_str, + int32_t *serz_len, char delimiter) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + char *xattr = NULL; + int i = 0; + int len = 0; + int keylen = 0; + size_t str_len = 0; + int ret = -1; + + priv = this->private; + local = frame->local; + + keylen = strlen(local->cont.getxattr.name); + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid || local->replies[i].op_ret) { + str_len = strlen(default_str); + buf = strncat(buf, default_str, str_len); + len += str_len; + buf[len++] = delimiter; + buf[len] = '\0'; + } else { + ret = dict_get_strn(local->replies[i].xattr, + local->cont.getxattr.name, keylen, &xattr); + if (ret) { + gf_msg("TEST", GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "Failed to get the node_uuid of brick " + "%d", + i); goto out; + } + str_len = strlen(xattr); + buf = strncat(buf, xattr, str_len); + len += str_len; + buf[len++] = delimiter; + buf[len] = '\0'; } + } + buf[--len] = '\0'; /*remove the last delimiter*/ + if (serz_len) + *serz_len = ++len; + ret = 0; - local->fd = fd_ref (fd); - local->cont.lk.cmd = cmd; - local->cont.lk.user_flock = *flock; - local->cont.lk.ret_flock = *flock; +out: + return ret; +} - STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, - priv->children[i], - priv->children[i]->fops->lk, - fd, cmd, flock); +uint64_t +afr_write_subvol_get(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + uint64_t write_subvol = 0; - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (lk, frame, op_ret, op_errno, NULL); + local = frame->local; + LOCK(&local->inode->lock); + write_subvol = local->inode_ctx->write_subvol; + UNLOCK(&local->inode->lock); + + return write_subvol; +} + +int +afr_write_subvol_set(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *data_accused = NULL; + 
unsigned char *metadata_accused = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint64_t val = 0; + int event = 0; + int i = 0; + + local = frame->local; + priv = this->private; + data_accused = alloca0(priv->child_count); + metadata_accused = alloca0(priv->child_count); + data_readable = alloca0(priv->child_count); + metadata_readable = alloca0(priv->child_count); + event = local->event_generation; + + afr_readables_fill(frame, this, local->inode, data_accused, + metadata_accused, data_readable, metadata_readable, + NULL); + + for (i = 0; i < priv->child_count; i++) { + if (data_readable[i]) + datamap |= (1 << i); + if (metadata_readable[i]) + metadatamap |= (1 << i); + } + + val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) | + (((uint64_t)event) << 32); + + LOCK(&local->inode->lock); + { + if (local->inode_ctx->write_subvol == 0 && + local->transaction.type == AFR_DATA_TRANSACTION) { + local->inode_ctx->write_subvol = val; } - return 0; + } + UNLOCK(&local->inode->lock); + + return 0; } int -afr_priv_dump (xlator_t *this) +afr_write_subvol_reset(call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 0; + afr_local_t *local = NULL; + local = frame->local; + LOCK(&local->inode->lock); + { + GF_ASSERT(local->inode_ctx->lock_count > 0); + local->inode_ctx->lock_count--; - GF_ASSERT (this); - priv = this->private; + if (!local->inode_ctx->lock_count) + local->inode_ctx->write_subvol = 0; + } + UNLOCK(&local->inode->lock); - GF_ASSERT (priv); - snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); - gf_proc_dump_add_section(key_prefix); - gf_proc_dump_build_key(key, key_prefix, "child_count"); - gf_proc_dump_write(key, "%u", priv->child_count); - gf_proc_dump_build_key(key, key_prefix, "read_child_rr"); - gf_proc_dump_write(key, "%u", 
priv->read_child_rr); - for (i = 0; i < priv->child_count; i++) { - gf_proc_dump_build_key(key, key_prefix, "child_up[%d]", i); - gf_proc_dump_write(key, "%d", priv->child_up[i]); - gf_proc_dump_build_key(key, key_prefix, - "pending_key[%d]", i); - gf_proc_dump_write(key, "%s", priv->pending_key[i]); - } - gf_proc_dump_build_key(key, key_prefix, "data_self_heal"); - gf_proc_dump_write(key, "%d", priv->data_self_heal); - gf_proc_dump_build_key(key, key_prefix, "metadata_self_heal"); - gf_proc_dump_write(key, "%d", priv->metadata_self_heal); - gf_proc_dump_build_key(key, key_prefix, "entry_self_heal"); - gf_proc_dump_write(key, "%d", priv->entry_self_heal); - gf_proc_dump_build_key(key, key_prefix, "data_change_log"); - gf_proc_dump_write(key, "%d", priv->data_change_log); - gf_proc_dump_build_key(key, key_prefix, "metadata_change_log"); - gf_proc_dump_write(key, "%d", priv->metadata_change_log); - gf_proc_dump_build_key(key, key_prefix, "entry_change_log"); - gf_proc_dump_write(key, "%d", priv->entry_change_log); - gf_proc_dump_build_key(key, key_prefix, "read_child"); - gf_proc_dump_write(key, "%d", priv->read_child); - gf_proc_dump_build_key(key, key_prefix, "favorite_child"); - gf_proc_dump_write(key, "%u", priv->favorite_child); - gf_proc_dump_build_key(key, key_prefix, "data_lock_server_count"); - gf_proc_dump_write(key, "%u", priv->data_lock_server_count); - gf_proc_dump_build_key(key, key_prefix, "metadata_lock_server_count"); - gf_proc_dump_write(key, "%u", priv->metadata_lock_server_count); - gf_proc_dump_build_key(key, key_prefix, "entry_lock_server_count"); - gf_proc_dump_write(key, "%u", priv->entry_lock_server_count); - gf_proc_dump_build_key(key, key_prefix, "wait_count"); - gf_proc_dump_write(key, "%u", priv->wait_count); + return 0; +} - return 0; +int +afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode) +{ + int ret = 0; + + local->inode = inode_ref(inode); + LOCK(&local->inode->lock); + { + ret = __afr_inode_ctx_get(this, 
local->inode, &local->inode_ctx); + } + UNLOCK(&local->inode->lock); + if (ret < 0) { + gf_msg_callingfn( + this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_INODE_CTX_GET_FAILED, + "Error getting inode ctx %s", uuid_utoa(local->inode->gfid)); + } + return ret; } +gf_boolean_t +afr_ta_is_fop_called_from_synctask(xlator_t *this) +{ + struct synctask *task = NULL; + gf_lkowner_t tmp_owner = { + 0, + }; -/** - * find_child_index - find the child's index in the array of subvolumes - * @this: AFR - * @child: child - */ + task = synctask_get(); + if (!task) + return _gf_false; -static int -find_child_index (xlator_t *this, xlator_t *child) -{ - afr_private_t *priv = NULL; + set_lk_owner_from_ptr(&tmp_owner, (void *)this); - int i = -1; + if (!is_same_lkowner(&tmp_owner, &task->frame->root->lk_owner)) + return _gf_false; - priv = this->private; + return _gf_true; +} - for (i = 0; i < priv->child_count; i++) { - if ((xlator_t *) child == priv->children[i]) - break; +int +afr_ta_post_op_lock(xlator_t *this, loc_t *loc) +{ + int ret = 0; + uuid_t gfid = { + 0, + }; + afr_private_t *priv = this->private; + gf_boolean_t locked = _gf_false; + struct gf_flock flock1 = { + 0, + }; + struct gf_flock flock2 = { + 0, + }; + int32_t cmd = 0; + + /* Clients must take AFR_TA_DOM_NOTIFY lock only when the previous lock + * has been released in afr_notify due to upcall notification from shd. 
+ */ + GF_ASSERT(priv->ta_notify_dom_lock_offset == 0); + + if (!priv->shd.iamshd) + GF_ASSERT(afr_ta_is_fop_called_from_synctask(this)); + flock1.l_type = F_WRLCK; + + while (!locked) { + if (priv->shd.iamshd) { + cmd = F_SETLKW; + flock1.l_start = 0; + flock1.l_len = 0; + } else { + cmd = F_SETLK; + gf_uuid_generate(gfid); + flock1.l_start = gfid_to_ino(gfid); + if (flock1.l_start < 0) + flock1.l_start = -flock1.l_start; + flock1.l_len = 1; + } + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_NOTIFY, loc, cmd, &flock1, NULL, NULL); + if (!ret) { + locked = _gf_true; + priv->ta_notify_dom_lock_offset = flock1.l_start; + } else if (ret == -EAGAIN) { + continue; + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to get " + "AFR_TA_DOM_NOTIFY lock on %s.", + loc->name); + goto out; } + } + + flock2.l_type = F_WRLCK; + flock2.l_start = 0; + flock2.l_len = 0; + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_MODIFY, loc, F_SETLKW, &flock2, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to get AFR_TA_DOM_MODIFY lock on %s.", loc->name); + flock1.l_type = F_UNLCK; + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock1, NULL, + NULL); + } +out: + return ret; +} - return i; +int +afr_ta_post_op_unlock(xlator_t *this, loc_t *loc) +{ + afr_private_t *priv = this->private; + struct gf_flock flock = { + 0, + }; + int ret = 0; + + if (!priv->shd.iamshd) + GF_ASSERT(afr_ta_is_fop_called_from_synctask(this)); + flock.l_type = F_UNLCK; + flock.l_start = 0; + flock.l_len = 0; + + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_MODIFY, loc, F_SETLK, &flock, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to unlock AFR_TA_DOM_MODIFY lock."); + goto out; + } + + if (!priv->shd.iamshd) + /* Mounts (clients) will not release the 
AFR_TA_DOM_NOTIFY lock + * in post-op as they use it as a notification mechanism. When + * shd sends a lock request on TA during heal, the clients will + * receive a lock-contention upcall notification upon which they + * will release the AFR_TA_DOM_NOTIFY lock after completing the + * in flight I/O.*/ + goto out; + + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to unlock AFR_TA_DOM_NOTIFY lock."); + } +out: + return ret; } -int32_t -afr_notify (xlator_t *this, int32_t event, - void *data, ...) +call_frame_t * +afr_ta_frame_create(xlator_t *this) { - afr_private_t * priv = NULL; - unsigned char * child_up = NULL; + call_frame_t *frame = NULL; + void *lk_owner = NULL; + + frame = create_frame(this, this->ctx->pool); + if (!frame) + return NULL; + lk_owner = (void *)this; + afr_set_lk_owner(frame, this, lk_owner); + return frame; +} - int i = -1; - int up_children = 0; - int down_children = 0; +gf_boolean_t +afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local) +{ + int data_count = 0; - priv = this->private; + data_count = AFR_COUNT(local->child_up, priv->child_count); + if (data_count == 2) { + return _gf_true; + } else if (data_count == 1 && local->ta_child_up) { + return _gf_true; + } - if (!priv) - return 0; + return _gf_false; +} - child_up = priv->child_up; +static gf_boolean_t +afr_is_add_replica_mount_lookup_on_root(call_frame_t *frame) +{ + afr_local_t *local = NULL; - switch (event) { - case GF_EVENT_CHILD_UP: - i = find_child_index (this, data); + if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) + return _gf_false; - /* temporarily - afr_attempt_lock_recovery (this, i); - */ + local = frame->local; - child_up[i] = 1; + if (local->op != GF_FOP_LOOKUP) + /* TODO:If the replica count is being increased on a plain distribute + * volume that was never mounted, we need to allow setxattr on '/' 
with + * GF_CLIENT_PID_NO_ROOT_SQUASH to accomodate for DHT layout setting */ + return _gf_false; - LOCK (&priv->lock); - { - priv->up_count++; - } - UNLOCK (&priv->lock); + if (local->inode == NULL) + return _gf_false; - /* - if all the children were down, and one child came up, - send notify to parent - */ - - for (i = 0; i < priv->child_count; i++) - if (child_up[i] == 1) - up_children++; - - if (up_children == 1) { - gf_log (this->name, GF_LOG_NORMAL, - "Subvolume '%s' came back up; " - "going online.", ((xlator_t *)data)->name); - - default_notify (this, event, data); - } else { - default_notify (this, GF_EVENT_CHILD_MODIFIED, data); - } + if (!__is_root_gfid(local->inode->gfid)) + return _gf_false; - break; + return _gf_true; +} - case GF_EVENT_CHILD_DOWN: - i = find_child_index (this, data); +gf_boolean_t +afr_lookup_has_quorum(call_frame_t *frame, const unsigned int up_children_count) +{ + if (frame && (up_children_count > 0) && + afr_is_add_replica_mount_lookup_on_root(frame)) + return _gf_true; - child_up[i] = 0; + return _gf_false; +} - LOCK (&priv->lock); - { - priv->down_count++; - } - UNLOCK (&priv->lock); +void +afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + unsigned char *success_replies = NULL; - /* - if all children are down, and this was the last to go down, - send notify to parent - */ - - for (i = 0; i < priv->child_count; i++) - if (child_up[i] == 0) - down_children++; - - if (down_children == priv->child_count) { - gf_log (this->name, GF_LOG_ERROR, - "All subvolumes are down. 
Going offline " - "until atleast one of them comes back up."); - - default_notify (this, event, data); - } else { - default_notify (this, GF_EVENT_CHILD_MODIFIED, data); - } + success_replies = alloca0(priv->child_count); + afr_fill_success_replies(local, priv, success_replies); - break; + if (priv->quorum_count && !afr_has_quorum(success_replies, this, NULL)) { + local->op_errno = afr_final_errno(local, priv); + if (!local->op_errno) + local->op_errno = afr_quorum_errno(priv); + local->op_ret = -1; + } +} - default: - default_notify (this, event, data); +gf_boolean_t +afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, int child) +{ + int *pending = NULL; + int ret = 0; + int i = 0; + + ret = dict_get_ptr(dict, priv->pending_key[child], (void *)&pending); + if (ret == 0) { + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + /* Not doing a ntoh32(pending) as we just want to check + * if it is non-zero or not. */ + if (pending[i]) { + return _gf_true; + } } + } - return 0; - + return _gf_false; } diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 19ddcbda762..f8bf8340dab 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -1,753 +1,346 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. 
If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> #include <string.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "checksum.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/dict.h> +#include <glusterfs/list.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> #include "afr.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" - - -int -afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - afr_set_opendir_done (this, local->fd->inode); - - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); - - return 0; -} - - -gf_boolean_t -__checksums_differ (uint32_t *checksum, int child_count, - unsigned char *child_up) -{ - int ret = _gf_false; - int i = 0; - - uint32_t cksum; - - cksum = checksum[0]; - - for (i = 0; i < child_count; i++) { - if (!child_up[i]) - continue; - - if (cksum != checksum[i]) { - ret = _gf_true; - break; - } - - cksum = checksum[i]; - } - - return ret; -} - +#include "afr-transaction.h" int32_t -afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t 
op_errno, - gf_dirent_t *entries) +afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - - int child_index = 0; - - uint32_t entry_cksum; - - int call_count = 0; - off_t last_offset = 0; - char sh_type_str[256] = {0,}; + afr_local_t *local = NULL; + int call_count = -1; + int32_t child_index = 0; + afr_fd_ctx_t *fd_ctx = NULL; - priv = this->private; - local = frame->local; - sh = &local->self_heal; + local = frame->local; + fd_ctx = local->fd_ctx; + child_index = (long)cookie; - child_index = (long) cookie; + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + LOCK(&frame->lock); + { if (op_ret == -1) { - local->op_ret = -1; - local->op_ret = op_errno; - goto out; + local->op_errno = op_errno; + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { + local->op_ret = op_ret; + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); } + call_count = --local->call_count; + } + UNLOCK(&frame->lock); - if (op_ret == 0) - goto out; + if (call_count == 0) { + afr_handle_replies_quorum(frame, this); + AFR_STACK_UNWIND(opendir, frame, local->op_ret, local->op_errno, + local->fd, NULL); + } - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - entry_cksum = gf_rsync_weak_checksum (entry->d_name, - strlen (entry->d_name)); - local->cont.opendir.checksum[child_index] ^= entry_cksum; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - } - - /* read more entries */ - - STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->readdir, - local->fd, 
131072, last_offset); - -out: - if ((op_ret == 0) || (op_ret == -1)) { - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (__checksums_differ (local->cont.opendir.checksum, - priv->child_count, - local->child_up)) { - - sh->need_entry_self_heal = _gf_true; - sh->forced_merge = _gf_true; - sh->type = local->fd->inode->ia_type; - sh->background = _gf_false; - sh->unwind = afr_examine_dir_sh_unwind; - - afr_self_heal_type_str_get(&local->self_heal, - sh_type_str, - sizeof(sh_type_str)); - gf_log (this->name, GF_LOG_NORMAL, - "%s self-heal triggered. path: %s, " - "reason: checksums of directory differ," - " forced merge option set", - sh_type_str, local->loc.path); - - afr_self_heal (frame, this); - } else { - afr_set_opendir_done (this, local->fd->inode); - - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); - } - } - } - - return 0; + return 0; } - int -afr_examine_dir (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - int i; - int call_count = 0; - - local = frame->local; - priv = this->private; - - local->cont.opendir.checksum = GF_CALLOC (priv->child_count, - sizeof (*local->cont.opendir.checksum), - gf_afr_mt_int32_t); - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->readdir, - local->fd, 131072, 0); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int32_t -afr_opendir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int32_t up_children_count = 0; - int ret = -1; - - int call_count = -1; - - priv = this->private; - local = frame->local; - - up_children_count = 
afr_up_children_count (priv->child_count, - local->child_up); - - LOCK (&frame->lock); - { - if (op_ret >= 0) - local->op_ret = op_ret; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (local->op_ret == 0) { - - ret = afr_fd_ctx_set (this, local->fd); - - if (ret) { - local->op_ret = -1; - local->op_errno = -1; - gf_log (this->name, GF_LOG_ERROR, " failed to " - "set fd ctx for fd %p", local->fd); - goto out; - } - if (!afr_is_opendir_done (this, local->fd->inode) && - up_children_count > 1) { - - /* - * This is the first opendir on this inode. We need - * to check if the directory's entries are the same - * on all subvolumes. This is needed in addition - * to regular entry self-heal because the readdir - * call is sent only to the first subvolume, and - * thus files that exist only there will never be healed - * otherwise (assuming changelog shows no anamolies). - */ - - gf_log (this->name, GF_LOG_TRACE, - "reading contents of directory %s looking for mismatch", - local->loc.path); - - afr_examine_dir (frame, this); - - } else { - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); - } - } else { -out: - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); - } - } - - return 0; -} - - -int32_t -afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd) +afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - int child_count = 0; - int i = 0; - - int ret = -1; - int call_count = -1; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = -1; + int32_t op_errno = ENOMEM; + afr_fd_ctx_t *fd_ctx = NULL; - int32_t op_ret = -1; - int32_t op_errno = 0; + priv = this->private; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + 
local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - priv = this->private; + local->op = GF_FOP_OPENDIR; - child_count = priv->child_count; - - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - loc_copy (&local->loc, loc); - - frame->local = local; - local->fd = fd_ref (fd); - - call_count = local->call_count; - - for (i = 0; i < child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_opendir_cbk, - priv->children[i], - priv->children[i]->fops->opendir, - loc, fd); - - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (opendir, frame, op_ret, op_errno, fd); - } + if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) { + op_errno = afr_quorum_errno(priv); + goto out; + } - return 0; -} + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) + goto out; -/** - * Common algorithm for directory read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - * try the next child - * - * Applicable to: readdir - */ + loc_copy(&local->loc, loc); + local->fd = fd_ref(fd); + local->fd_ctx = fd_ctx; -struct entry_name { - char *name; - struct list_head list; -}; + call_count = local->call_count; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE(frame, afr_opendir_cbk, (void *)(long)i, + priv->children[i], + priv->children[i]->fops->opendir, loc, fd, NULL); -static gf_boolean_t -remembered_name (const char *name, struct list_head *entries) -{ - struct entry_name *e; - gf_boolean_t ret = _gf_false; - - list_for_each_entry (e, entries, list) { - if (!strcmp (name, e->name)) { - ret = _gf_true; - goto out; - } + if (!--call_count) + break; } + } + return 0; out: - return ret; + AFR_STACK_UNWIND(opendir, frame, -1, op_errno, fd, NULL); + return 
0; } - -static void -afr_remember_entries (gf_dirent_t *entries, fd_t *fd) +static int +afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol) { - struct entry_name *n = NULL; - gf_dirent_t * entry = NULL; - - int ret = 0; - - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_DEBUG, - "could not get fd ctx for fd=%p", fd); - return; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - list_for_each_entry (entry, &entries->list, list) { - n = GF_CALLOC (1, sizeof (*n), gf_afr_mt_entry_name); - n->name = gf_strdup (entry->d_name); - INIT_LIST_HEAD (&n->list); + int gen = 0; + int entry_read_subvol = 0; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + data_readable = alloca0(priv->child_count); + metadata_readable = alloca0(priv->child_count); + + afr_inode_read_subvol_get(inode, this, data_readable, metadata_readable, + &gen); + + if (gen != priv->event_generation || !data_readable[par_read_subvol] || + !metadata_readable[par_read_subvol]) + return -1; + + /* Once the control reaches the following statement, it means that the + * parent's read subvol is perfectly readable. So calling + * either afr_data_subvol_get() or afr_metadata_subvol_get() would + * yield the same result. Hence, choosing afr_data_subvol_get() below. + */ + + if (!priv->consistent_metadata) + return 0; - list_add (&n->list, &fd_ctx->entries); - } + /* For an inode fetched through readdirp which is yet to be linked, + * inode ctx would not be initialised (yet). So this function returns + * -1 above due to gen being 0, which is why it is OK to pass NULL for + * read_subvol_args here. 
+ */ + entry_read_subvol = afr_data_subvol_get(inode, this, NULL, NULL, NULL, + NULL); + if (entry_read_subvol != par_read_subvol) + return -1; + + return 0; } - -static off_t -afr_filter_entries (gf_dirent_t *entries, fd_t *fd) +static void +afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries, + int subvol, gf_dirent_t *entries, fd_t *fd) { - gf_dirent_t *entry, *tmp; - int ret = 0; - - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; - - off_t offset = 0; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_DEBUG, - "could not get fd ctx for fd=%p", fd); - return -1; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - offset = entry->d_off; - - if (remembered_name (entry->d_name, &fd_ctx->entries)) { - list_del (&entry->list); - GF_FREE (entry); - } - } + int ret = -1; + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + xlator_t *this = NULL; + afr_private_t *priv = NULL; + gf_boolean_t need_heal = _gf_false; + gf_boolean_t validate_subvol = _gf_false; + + this = THIS; + priv = this->private; + + need_heal = afr_get_need_heal(this); + validate_subvol = need_heal | priv->consistent_metadata; + + list_for_each_entry_safe(entry, tmp, &subvol_entries->list, list) + { + if (afr_is_private_directory(priv, fd->inode->gfid, entry->d_name, + frame->root->pid)) { + continue; + } - return offset; -} + list_del_init(&entry->list); + list_add_tail(&entry->list, &entries->list); + if (!validate_subvol) + continue; -static void -afr_forget_entries (fd_t *fd) -{ - struct entry_name *entry, *tmp; - int ret = 0; - - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_DEBUG, - "could not get fd ctx for fd=%p", fd); - return; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) { - GF_FREE (entry->name); - list_del (&entry->list); - GF_FREE 
(entry); - } + if (entry->inode) { + ret = afr_validate_read_subvol(entry->inode, this, subvol); + if (ret == -1) { + inode_unref(entry->inode); + entry->inode = NULL; + continue; + } + } + } } - int32_t -afr_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) +afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; + afr_local_t *local = NULL; + gf_dirent_t entries; - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; + INIT_LIST_HEAD(&entries.list); - int child_index = -1; + local = frame->local; - priv = this->private; - local = frame->local; - child_index = (long) cookie; + if (op_ret < 0 && !local->cont.readdir.offset) { + /* failover only if this was first readdir, detected + by offset == 0 */ + local->op_ret = op_ret; + local->op_errno = op_errno; - if (op_ret != -1) { - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - entry->d_ino = afr_itransform (entry->d_ino, - priv->child_count, - child_index); + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - if ((local->fd->inode == local->fd->inode->table->root) - && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { - list_del_init (&entry->list); - GF_FREE (entry); - } - } - } + if (op_ret >= 0) + afr_readdir_transform_entries(frame, subvol_entries, (long)cookie, + &entries, local->fd); - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries); + AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata); - return 0; -} + gf_dirent_free(&entries); + return 0; +} -int32_t -afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries) +int +afr_readdir_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - 
xlator_t ** children = NULL; - ino_t inum = 0; - - int call_child = 0; - int ret = 0; - - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - - int child_index = -1; - - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - off_t offset = 0; - - priv = this->private; - children = priv->children; - - local = frame->local; - - child_index = (long) cookie; - - if (priv->strict_readdir) { - ret = fd_ctx_get (local->fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not get fd ctx for fd=%p", local->fd); - op_ret = -1; - op_errno = -ret; - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - if (child_went_down (op_ret, op_errno)) { - if (all_tried (child_index, priv->child_count)) { - goto out; - } - - call_child = ++child_index; - - gf_log (this->name, GF_LOG_TRACE, - "starting readdir afresh on child %d, offset %"PRId64, - call_child, (uint64_t) 0); - - fd_ctx->failed_over = _gf_true; - - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdirp, local->fd, - local->cont.readdir.size, 0); - return 0; - } - } - - if (op_ret != -1) { - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - inum = afr_itransform (entry->d_ino, priv->child_count, - child_index); - entry->d_ino = inum; - inum = afr_itransform (entry->d_stat.ia_ino, - priv->child_count, child_index); - entry->d_stat.ia_ino = inum; - - if ((local->fd->inode == local->fd->inode->table->root) - && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { - list_del_init (&entry->list); - GF_FREE (entry); - } - } - } - - if (priv->strict_readdir) { - if (fd_ctx->failed_over) { - if (list_empty (&entries->list)) { - goto out; - } - - offset = afr_filter_entries (entries, local->fd); - - afr_remember_entries (entries, local->fd); - - if (list_empty (&entries->list)) { - /* All the entries we got were duplicate. 
We - shouldn't send an empty list now, because - that'll make the application stop reading. So - try to get more entries */ - - gf_log (this->name, GF_LOG_TRACE, - "trying to fetch non-duplicate entries from offset %"PRId64", child %s", - offset, children[child_index]->name); - - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) child_index, - children[child_index], - children[child_index]->fops->readdirp, - local->fd, local->cont.readdir.size, offset); - return 0; - } - } else { - afr_remember_entries (entries, local->fd); - } - } - -out: - AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries); - + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + + priv = this->private; + local = frame->local; + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx) { + local->op_errno = EINVAL; + local->op_ret = -1; + } + + if (subvol == -1 || !fd_ctx) { + AFR_STACK_UNWIND(readdir, frame, local->op_ret, local->op_errno, 0, 0); return 0; + } + + fd_ctx->readdir_subvol = subvol; + + if (local->op == GF_FOP_READDIR) + STACK_WIND_COOKIE(frame, afr_readdir_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdir, local->fd, + local->cont.readdir.size, local->cont.readdir.offset, + local->xdata_req); + else + STACK_WIND_COOKIE(frame, afr_readdir_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdirp, local->fd, + local->cont.readdir.size, local->cont.readdir.offset, + local->xdata_req); + return 0; } - -int32_t -afr_do_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, int whichop) +int +afr_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, int whichop, dict_t *dict) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; - - int ret = -1; - - int32_t op_ret = -1; - int32_t 
op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - children = priv->children; - - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - frame->local = local; - - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } - - local->fd = fd_ref (fd); - local->cont.readdir.size = size; - - if (priv->strict_readdir) { - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not get fd ctx for fd=%p", fd); - op_errno = -ret; - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - if (fd_ctx->last_tried != call_child) { - gf_log (this->name, GF_LOG_TRACE, - "first up child has changed from %d to %d, restarting readdir from offset 0", - fd_ctx->last_tried, call_child); - - fd_ctx->failed_over = _gf_true; - offset = 0; - } - - fd_ctx->last_tried = call_child; - } - - if (whichop == GF_FOP_READDIR) - STACK_WIND_COOKIE (frame, afr_readdir_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdir, fd, - size, offset); - else - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdirp, fd, - size, offset); - - op_ret = 0; + afr_local_t *local = NULL; + int32_t op_errno = 0; + int subvol = -1; + afr_fd_ctx_t *fd_ctx = NULL; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) { + op_errno = EINVAL; + goto out; + } + + local->op = whichop; + local->fd = fd_ref(fd); + local->cont.readdir.size = size; + local->cont.readdir.offset = offset; + local->xdata_req = (dict) ? 
dict_ref(dict) : NULL; + + subvol = fd_ctx->readdir_subvol; + + if (offset == 0 || subvol == -1) { + /* First readdir has option of failing over and selecting + an appropriate read subvolume */ + afr_read_txn(frame, this, fd->inode, afr_readdir_wind, + AFR_DATA_TRANSACTION); + } else { + /* But continued readdirs MUST stick to the same subvolume + without an option to failover */ + afr_readdir_wind(frame, this, subvol); + } + + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, NULL); - } - return 0; + AFR_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL); + return 0; } - int32_t -afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) +afr_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR); - return 0; -} + afr_do_readdir(frame, this, fd, size, offset, GF_FOP_READDIR, xdata); + return 0; +} int32_t -afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) +afr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *dict) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP); - return 0; -} + afr_do_readdir(frame, this, fd, size, offset, GF_FOP_READDIRP, dict); + return 0; +} int32_t -afr_releasedir (xlator_t *this, fd_t *fd) +afr_releasedir(xlator_t *this, fd_t *fd) { - afr_forget_entries (fd); - afr_cleanup_fd_ctx (this, fd); + afr_cleanup_fd_ctx(this, fd); - return 0; + return 0; } diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h index 40c7b6aef28..773e925ec6c 100644 --- a/xlators/cluster/afr/src/afr-dir-read.h +++ b/xlators/cluster/afr/src/afr-dir-read.h @@ -1,50 +1,33 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef __DIR_READ_H__ #define __DIR_READ_H__ - int32_t -afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd); +afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata); int32_t -afr_releasedir (xlator_t *this, fd_t *fd); +afr_releasedir(xlator_t *this, fd_t *fd); int32_t -afr_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); - +afr_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata); int32_t -afr_readdirp (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); +afr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *dict); int32_t -afr_getdents (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, int32_t flag); - - -int32_t -afr_checksum (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags); - +afr_checksum(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + dict_t *xdata); #endif /* __DIR_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index af42e7e06a0..b7cceb79158 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -1,1349 +1,937 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. 
- - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" +#include <glusterfs/glusterfs.h> #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" +#include <glusterfs/dict.h> +#include <glusterfs/logging.h> +#include <glusterfs/list.h> +#include <glusterfs/defaults.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include <glusterfs/byte-order.h> #include "afr.h" #include "afr-transaction.h" - void -afr_build_parent_loc (loc_t *parent, loc_t *child) +afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this); + +int +afr_build_parent_loc(loc_t *parent, loc_t *child, int32_t *op_errno) { - char *tmp = NULL; + int ret = -1; + char *child_path = NULL; + + if (!child->parent) { + if (op_errno) + *op_errno = EINVAL; + goto out; + } + + child_path = gf_strdup(child->path); + if (!child_path) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + parent->path = gf_strdup(dirname(child_path)); + if (!parent->path) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + parent->inode = inode_ref(child->parent); + gf_uuid_copy(parent->gfid, child->pargfid); + + ret = 0; +out: + GF_FREE(child_path); - if (!child->parent) { - 
loc_copy (parent, child); - return; - } + return ret; +} - tmp = gf_strdup (child->path); - parent->path = gf_strdup (dirname (tmp)); - GF_FREE (tmp); +static void +__afr_dir_write_finalize(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int inode_read_subvol = -1; + int parent_read_subvol = -1; + int parent2_read_subvol = -1; + int i = 0; + afr_read_subvol_args_t args = { + 0, + }; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == -1) + continue; + gf_uuid_copy(args.gfid, local->replies[i].poststat.ia_gfid); + args.ia_type = local->replies[i].poststat.ia_type; + break; + } + + if (local->inode) { + if (local->op != GF_FOP_RENAME && local->op != GF_FOP_LINK) + afr_replies_interpret(frame, this, local->inode, NULL); + + inode_read_subvol = afr_data_subvol_get(local->inode, this, NULL, NULL, + NULL, &args); + } + + if (local->parent) + parent_read_subvol = afr_data_subvol_get(local->parent, this, NULL, + local->readable, NULL, NULL); + + if (local->parent2) + parent2_read_subvol = afr_data_subvol_get(local->parent2, this, NULL, + local->readable2, NULL, NULL); + + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + afr_pick_error_xdata(local, priv, local->parent, local->readable, + local->parent2, local->readable2); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0) { + if (local->inode) + afr_inode_need_refresh_set(local->inode, this); + if (local->parent) + afr_inode_need_refresh_set(local->parent, this); + if (local->parent2) + afr_inode_need_refresh_set(local->parent2, this); + continue; + } - parent->name = strrchr (parent->path, '/'); - if (parent->name) - parent->name++; + if (local->op_ret == -1) { + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + + 
local->cont.dir_fop.buf = local->replies[i].poststat; + local->cont.dir_fop.preparent = local->replies[i].preparent; + local->cont.dir_fop.postparent = local->replies[i].postparent; + local->cont.dir_fop.prenewparent = local->replies[i].preparent2; + local->cont.dir_fop.postnewparent = local->replies[i].postparent2; + if (local->xdata_rsp) { + dict_unref(local->xdata_rsp); + local->xdata_rsp = NULL; + } + + if (local->replies[i].xdata) + local->xdata_rsp = dict_ref(local->replies[i].xdata); + continue; + } - parent->inode = inode_ref (child->parent); - parent->parent = inode_parent (parent->inode, 0, NULL); - parent->ino = parent->inode->ino; -} + if (i == inode_read_subvol) { + local->cont.dir_fop.buf = local->replies[i].poststat; + if (local->replies[i].xdata) { + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); + local->xdata_rsp = dict_ref(local->replies[i].xdata); + } + } -/* {{{ create */ + if (i == parent_read_subvol) { + local->cont.dir_fop.preparent = local->replies[i].preparent; + local->cont.dir_fop.postparent = local->replies[i].postparent; + } -int -afr_create_unwind (call_frame_t *frame, xlator_t *this) -{ - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.create.read_child_buf.ia_ino) { - unwind_buf = &local->cont.create.read_child_buf; - } else { - unwind_buf = &local->cont.create.buf; - } - - unwind_buf->ia_ino = local->cont.create.ino; - - local->cont.create.preparent.ia_ino = local->cont.create.parent_ino; - local->cont.create.postparent.ia_ino = local->cont.create.parent_ino; - - AFR_STACK_UNWIND (create, main_frame, - local->op_ret, local->op_errno, - local->cont.create.fd, - local->cont.create.inode, - unwind_buf, &local->cont.create.preparent, 
- &local->cont.create.postparent); + if (i == parent2_read_subvol) { + local->cont.dir_fop.prenewparent = local->replies[i].preparent2; + local->cont.dir_fop.postnewparent = local->replies[i].postparent2; } - - return 0; + } } - -int -afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - fd_t *fd, inode_t *inode, struct iatt *buf, - struct iatt *preparent, struct iatt *postparent) +static void +__afr_dir_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + int op_ret, int op_errno, struct iatt *poststat, + struct iatt *preparent, struct iatt *postparent, + struct iatt *preparent2, struct iatt *postparent2, + dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; - - int ret = 0; - - int call_count = -1; - int child_index = -1; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - ret = afr_fd_ctx_set (this, fd); - - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set ctx on fd=%p", fd); - - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - - ret = fd_ctx_get (fd, this, &ctx); - - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not get fd ctx for fd=%p", fd); - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->opened_on[child_index] = 1; - fd_ctx->flags = local->cont.create.flags; - - if (local->success_count == 0) { - local->cont.create.buf = *buf; - - local->cont.create.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - child_index); - - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - - 
if (child_index == local->first_up_child) { - local->cont.create.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - local->first_up_child); - } - - if (child_index == local->read_child_index) { - local->cont.create.read_child_buf = *buf; - local->cont.create.preparent = *preparent; - local->cont.create.postparent = *postparent; - } - - local->cont.create.inode = inode; - - local->success_count++; - } - - local->op_errno = op_errno; - } - -unlock: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + afr_local_t *local = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + + local = frame->local; + fd_ctx = local->fd_ctx; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (xdata) + local->replies[child_index].xdata = dict_ref(xdata); + + if (op_ret >= 0) { + if (poststat) + local->replies[child_index].poststat = *poststat; + if (preparent) + local->replies[child_index].preparent = *preparent; + if (postparent) + local->replies[child_index].postparent = *postparent; + if (preparent2) + local->replies[child_index].preparent2 = *preparent2; + if (postparent2) + local->replies[child_index].postparent2 = *postparent2; + if (fd_ctx) + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + } else { + if (op_errno != ENOTEMPTY) + afr_transaction_fop_failed(frame, this, child_index); + if (fd_ctx) + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } + + return; } - -int -afr_create_wind (call_frame_t *frame, xlator_t *this) +static int +__afr_dir_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + struct iatt *preparent2, struct iatt *postparent2, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int 
call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_create_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->create, - &local->loc, - local->cont.create.flags, - local->cont.create.mode, - local->cont.create.fd, - local->cont.create.params); - if (!--call_count) - break; - } - } - - return 0; -} + afr_local_t *local = NULL; + int child_index = (long)cookie; + int call_count = -1; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + LOCK(&frame->lock); + { + __afr_dir_write_fill(frame, this, child_index, op_ret, op_errno, buf, + preparent, postparent, preparent2, postparent2, + xdata); + call_count = --local->call_count; + } + UNLOCK(&frame->lock); + + if (call_count == 0) { + __afr_dir_write_finalize(frame, this); + + if (afr_txn_nothing_failed(frame, this)) { + /*if it did pre-op, it will do post-op changing ctime*/ + if (priv->consistent_metadata && afr_needs_changelog_update(local)) + afr_zero_fill_stat(local); + local->transaction.unwind(frame, this); + } + + afr_mark_entry_pending_changelog(frame, this); + afr_transaction_resume(frame, this); + } + + return 0; +} int -afr_create_done (call_frame_t *frame, xlator_t *this) +afr_mark_new_entry_changelog_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) { - afr_local_t * local = NULL; + int call_count = 0; - local = frame->local; + call_count = afr_frame_return(frame); - local->transaction.unwind (frame, this); + if (call_count == 0) + AFR_STACK_DESTROY(frame); - AFR_STACK_DESTROY (frame); - - return 0; + return 0; } - -int -afr_create (call_frame_t *frame, 
xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) +void +afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; + call_frame_t *new_frame = NULL; + afr_local_t *local = NULL; + afr_local_t *new_local = NULL; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int32_t **changelog = NULL; + int i = 0; + int op_errno = ENOMEM; + unsigned char *pending = NULL; + int call_count = 0; + + local = frame->local; + priv = this->private; + + new_frame = copy_frame(frame); + if (!new_frame) + goto out; + + new_local = AFR_FRAME_INIT(new_frame, op_errno); + if (!new_local) + goto out; + + xattr = dict_new(); + if (!xattr) + goto out; + + pending = alloca0(priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && + !local->transaction.failed_subvols[i]) { + call_count++; + continue; + } + pending[i] = 1; + } - int op_ret = -1; - int op_errno = 0; + changelog = afr_mark_pending_changelog(priv, pending, xattr, + local->cont.dir_fop.buf.ia_type); + if (!changelog) + goto out; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + new_local->pending = changelog; + gf_uuid_copy(new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid); + new_local->loc.inode = inode_ref(local->inode); - priv = this->private; + new_local->call_count = call_count; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } + for (i = 0; i < priv->child_count; i++) { + if (pending[i]) + continue; - ALLOC_OR_GOTO (local, afr_local_t, out); + STACK_WIND_COOKIE(new_frame, afr_mark_new_entry_changelog_cbk, + (void *)(long)i, priv->children[i], + priv->children[i]->fops->xattrop, &new_local->loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL); + if (!--call_count) + 
break; + } - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + new_frame = NULL; +out: + if (new_frame) + AFR_STACK_DESTROY(new_frame); + if (xattr) + dict_unref(xattr); + return; +} - transaction_frame->local = local; +void +afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int pre_op_count = 0; + int failed_count = 0; + unsigned char *success_replies = NULL; - loc_copy (&local->loc, loc); + local = frame->local; + priv = this->private; - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + if (local->op_ret < 0) + return; - local->cont.create.flags = flags; - local->cont.create.mode = mode; - local->cont.create.fd = fd_ref (fd); - if (params) - local->cont.create.params = dict_ref (params); + if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD && + local->op != GF_FOP_MKDIR) + return; - if (loc->parent) - local->cont.create.parent_ino = loc->parent->ino; + pre_op_count = AFR_COUNT(local->transaction.pre_op, priv->child_count); + failed_count = AFR_COUNT(local->transaction.failed_subvols, + priv->child_count); - local->transaction.fop = afr_create_wind; - local->transaction.done = afr_create_done; - local->transaction.unwind = afr_create_unwind; + /* FOP succeeded on all bricks. */ + if (pre_op_count == priv->child_count && !failed_count) + return; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + /* FOP did not suceed on quorum no. of bricks. 
*/ + success_replies = alloca0(priv->child_count); + afr_fill_success_replies(local, priv, success_replies); + if (!afr_has_quorum(success_replies, this, NULL)) + return; - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); + if (priv->thin_arbiter_count) { + /*Mark new entry using ta file*/ + local->is_new_entry = _gf_true; + return; + } - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + afr_mark_new_entry_changelog(frame, this); - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (create, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL, NULL); - } - - return 0; + return; } -/* }}} */ - -/* {{{ mknod */ +/* {{{ create */ int -afr_mknod_unwind (call_frame_t *frame, xlator_t *this) +afr_create_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - struct iatt *unwind_buf = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.mknod.read_child_buf.ia_ino) { - unwind_buf = &local->cont.mknod.read_child_buf; - } else { - unwind_buf = &local->cont.mknod.buf; - } - - unwind_buf->ia_ino = local->cont.mknod.ino; - - local->cont.mknod.preparent.ia_ino = local->cont.mknod.parent_ino; - local->cont.mknod.postparent.ia_ino = local->cont.mknod.parent_ino; - - AFR_STACK_UNWIND (mknod, main_frame, - local->op_ret, local->op_errno, - local->cont.mknod.inode, - unwind_buf, &local->cont.mknod.preparent, - &local->cont.mknod.postparent); - } + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - return 0; -} + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + + if (!main_frame) + return 0; + + AFR_STACK_UNWIND(create, main_frame, local->op_ret, 
local->op_errno, + local->cont.create.fd, local->inode, + &local->cont.dir_fop.buf, &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; +} int -afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, +afr_create_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = -1; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0){ - local->cont.mknod.buf = *buf; - local->cont.mknod.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - child_index); - - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - - if (child_index == local->first_up_child) { - local->cont.mknod.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - local->first_up_child); - } - - if (child_index == local->read_child_index) { - local->cont.mknod.read_child_buf = *buf; - local->cont.mknod.preparent = *preparent; - local->cont.mknod.postparent = *postparent; - } - - local->cont.mknod.inode = inode; - - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preparent, 
postparent, NULL, NULL, xdata); } - -int32_t -afr_mknod_wind (call_frame_t *frame, xlator_t *this) +int +afr_create_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mknod, - &local->loc, local->cont.mknod.mode, - local->cont.mknod.dev, - local->cont.mknod.params); - if (!--call_count) - break; - } - } - - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_create_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->create, &local->loc, + local->cont.create.flags, local->cont.create.mode, + local->umask, local->cont.create.fd, local->xdata_req); + return 0; } - int -afr_mknod_done (call_frame_t *frame, xlator_t *this) +afr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - AFR_STACK_DESTROY (frame); + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + + local->fd_ctx = afr_fd_ctx_get(fd, this); + if (!local->fd_ctx) + goto out; + + local->inode = inode_ref(loc->inode); + 
local->parent = inode_ref(loc->parent); + + local->op = GF_FOP_CREATE; + local->cont.create.flags = flags; + local->fd_ctx->flags = flags; + local->cont.create.mode = mode; + local->cont.create.fd = fd_ref(fd); + local->umask = umask; + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_create_wind; + local->transaction.unwind = afr_create_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; } +/* }}} */ + +/* {{{ mknod */ int -afr_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev, dict_t *params) +afr_mknod_unwind(call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; - - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = frame->local; - transaction_frame->local = local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; - loc_copy 
(&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); - - local->cont.mknod.mode = mode; - local->cont.mknod.dev = dev; - if (params) - local->cont.mknod.params = dict_ref (params); - - if (loc->parent) - local->cont.mknod.parent_ino = loc->parent->ino; - - local->transaction.fop = afr_mknod_wind; - local->transaction.done = afr_mknod_done; - local->transaction.unwind = afr_mknod_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, loc); + AFR_STACK_UNWIND(mknod, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; +} - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); +int +afr_mknod_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); +} - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); +int +afr_mknod_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_mknod_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->mknod, &local->loc, + local->cont.mknod.mode, local->cont.mknod.dev, + local->umask, local->xdata_req); + return 0; +} - op_ret = 0; +int +afr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = 
copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + local->op = GF_FOP_MKNOD; + local->cont.mknod.mode = mode; + local->cont.mknod.dev = dev; + local->umask = umask; + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_mknod_wind; + local->transaction.unwind = afr_mknod_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (mknod, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); - } - - return 0; + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); + + AFR_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + return 0; } /* }}} */ /* {{{ mkdir */ - int -afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) +afr_mkdir_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - struct iatt *unwind_buf = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.mkdir.read_child_buf.ia_ino) { - unwind_buf = &local->cont.mkdir.read_child_buf; - } else { - unwind_buf = &local->cont.mkdir.buf; - } - - unwind_buf->ia_ino = local->cont.mkdir.ino; - - 
local->cont.mkdir.preparent.ia_ino = local->cont.mkdir.parent_ino; - local->cont.mkdir.postparent.ia_ino = local->cont.mkdir.parent_ino; - - AFR_STACK_UNWIND (mkdir, main_frame, - local->op_ret, local->op_errno, - local->cont.mkdir.inode, - unwind_buf, &local->cont.mkdir.preparent, - &local->cont.mkdir.postparent); - } + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - return 0; -} + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; -int -afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = -1; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) { - local->cont.mkdir.buf = *buf; - - local->cont.mkdir.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - child_index); - - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - - if (child_index == local->first_up_child) { - local->cont.mkdir.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - local->first_up_child); - } - - if (child_index == local->read_child_index) { - local->cont.mkdir.read_child_buf = *buf; - local->cont.mkdir.preparent = *preparent; - local->cont.mkdir.postparent = *postparent; - } - - local->cont.mkdir.inode = inode; - - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, 
this); - - local->transaction.resume (frame, this); - } - - return 0; + AFR_STACK_UNWIND(mkdir, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } - int -afr_mkdir_wind (call_frame_t *frame, xlator_t *this) +afr_mkdir_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mkdir, - &local->loc, local->cont.mkdir.mode, - local->cont.mkdir.params); - if (!--call_count) - break; - } - } - - return 0; + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } - int -afr_mkdir_done (call_frame_t *frame, xlator_t *this) +afr_mkdir_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = NULL; - - local = frame->local; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local->transaction.unwind (frame, this); + local = frame->local; + priv = this->private; - AFR_STACK_DESTROY (frame); - - return 0; + STACK_WIND_COOKIE(frame, afr_mkdir_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->mkdir, &local->loc, + local->cont.mkdir.mode, local->umask, local->xdata_req); + return 0; } - int -afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t 
*loc, mode_t mode, dict_t *params) +afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; - - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - transaction_frame->local = local; - - loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); - - local->cont.mkdir.mode = mode; - if (params) - local->cont.mkdir.params = dict_ref (params); - - if (loc->parent) - local->cont.mkdir.parent_ino = loc->parent->ino; - - local->transaction.fop = afr_mkdir_wind; - local->transaction.done = afr_mkdir_done; - local->transaction.unwind = afr_mkdir_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, loc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - - op_ret = 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + local->cont.mkdir.mode = mode; + local->umask = umask; + + if (!xdata || !dict_get_sizen(xdata, 
"gfid-req")) { + op_errno = EPERM; + gf_msg_callingfn(this->name, GF_LOG_WARNING, op_errno, + AFR_MSG_GFID_NULL, + "mkdir: %s is received " + "without gfid-req %p", + loc->path, xdata); + goto out; + } + + local->xdata_req = dict_copy_with_ref(xdata, NULL); + if (!local->xdata_req) { + op_errno = ENOMEM; + goto out; + } + + local->op = GF_FOP_MKDIR; + local->transaction.wind = afr_mkdir_wind; + local->transaction.unwind = afr_mkdir_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (mkdir, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + return 0; } /* }}} */ /* {{{ link */ - -int -afr_link_unwind (call_frame_t *frame, xlator_t *this) -{ - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - struct iatt *unwind_buf = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.link.read_child_buf.ia_ino) { - unwind_buf = &local->cont.link.read_child_buf; - } else { - unwind_buf = &local->cont.link.buf; - } - - unwind_buf->ia_ino = local->cont.link.ino; - - local->cont.link.preparent.ia_ino = local->cont.link.parent_ino; - local->cont.link.postparent.ia_ino = local->cont.link.parent_ino; - - AFR_STACK_UNWIND (link, main_frame, - local->op_ret, local->op_errno, - local->cont.link.inode, - 
unwind_buf, &local->cont.link.preparent, - &local->cont.link.postparent); - } - - return 0; -} - - int -afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) +afr_link_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = -1; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) { - local->cont.link.buf = *buf; - - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - - if (child_index == local->read_child_index) { - local->cont.link.read_child_buf = *buf; - local->cont.link.preparent = *preparent; - local->cont.link.postparent = *postparent; - } + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - local->cont.link.inode = inode; + local = frame->local; - local->success_count++; - } + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + AFR_STACK_UNWIND(link, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } - int -afr_link_wind (call_frame_t *frame, xlator_t *this) +afr_link_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, 
inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->link, - &local->loc, - &local->newloc); - - if (!--call_count) - break; - } - } - - return 0; + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } - int -afr_link_done (call_frame_t *frame, xlator_t *this) +afr_link_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = frame->local; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local->transaction.unwind (frame, this); + local = frame->local; + priv = this->private; - AFR_STACK_DESTROY (frame); - - return 0; + STACK_WIND_COOKIE(frame, afr_link_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->link, &local->loc, + &local->newloc, local->xdata_req); + return 0; } - int -afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) +afr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - int ret = -1; + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - int op_ret = -1; - int op_errno = 0; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if 
(!local) + goto out; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + loc_copy(&local->loc, oldloc); + loc_copy(&local->newloc, newloc); - priv = this->private; + local->inode = inode_ref(oldloc->inode); + local->parent = inode_ref(newloc->parent); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - ALLOC_OR_GOTO (local, afr_local_t, out); + if (!local->xdata_req) + goto out; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local->op = GF_FOP_LINK; - transaction_frame->local = local; + local->transaction.wind = afr_link_wind; + local->transaction.unwind = afr_link_unwind; - loc_copy (&local->loc, oldloc); - loc_copy (&local->newloc, newloc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + ret = afr_build_parent_loc(&local->transaction.parent_loc, newloc, + &op_errno); + if (ret) + goto out; - local->cont.link.ino = oldloc->inode->ino; + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(newloc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - if (oldloc->parent) - local->cont.link.parent_ino = newloc->parent->ino; - - local->transaction.fop = afr_link_wind; - local->transaction.done = afr_link_done; - local->transaction.unwind = afr_link_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, oldloc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (oldloc->path); - local->transaction.new_basename = AFR_BASENAME (newloc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); 
- - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (link, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); - } - - return 0; + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); + + AFR_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + return 0; } /* }}} */ /* {{{ symlink */ - int -afr_symlink_unwind (call_frame_t *frame, xlator_t *this) +afr_symlink_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - struct iatt *unwind_buf = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.symlink.read_child_buf.ia_ino) { - unwind_buf = &local->cont.symlink.read_child_buf; - } else { - unwind_buf = &local->cont.symlink.buf; - } - - unwind_buf->ia_ino = local->cont.symlink.ino; - - local->cont.symlink.preparent.ia_ino = local->cont.symlink.parent_ino; - local->cont.symlink.postparent.ia_ino = local->cont.symlink.parent_ino; - - AFR_STACK_UNWIND (symlink, main_frame, - local->op_ret, local->op_errno, - local->cont.symlink.inode, - unwind_buf, &local->cont.symlink.preparent, - &local->cont.symlink.postparent); - } + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - return 0; -} + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; -int -afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = -1; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; 
- - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) { - local->cont.symlink.buf = *buf; - local->cont.symlink.ino = - afr_itransform (buf->ia_ino, priv->child_count, - child_index); - - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - - if (child_index == local->first_up_child) { - local->cont.symlink.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - local->first_up_child); - } - - if (child_index == local->read_child_index) { - local->cont.symlink.read_child_buf = *buf; - local->cont.symlink.preparent = *preparent; - local->cont.symlink.postparent = *postparent; - } - - local->cont.symlink.inode = inode; - - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + AFR_STACK_UNWIND(symlink, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } - int -afr_symlink_wind (call_frame_t *frame, xlator_t *this) +afr_symlink_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = 
call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->symlink, - local->cont.symlink.linkpath, - &local->loc, - local->cont.symlink.params); - - if (!--call_count) - break; - - } - } - - return 0; + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } - int -afr_symlink_done (call_frame_t *frame, xlator_t *this) +afr_symlink_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_symlink_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->symlink, + local->cont.symlink.linkpath, &local->loc, local->umask, + local->xdata_req); + return 0; } - int -afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, dict_t *params) +afr_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; - - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - transaction_frame->local = local; - - loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - 
local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); - - local->cont.symlink.linkpath = gf_strdup (linkpath); - if (params) - local->cont.symlink.params = dict_ref (params); - - if (loc->parent) - local->cont.symlink.parent_ino = loc->parent->ino; - - local->transaction.fop = afr_symlink_wind; - local->transaction.done = afr_symlink_done; - local->transaction.unwind = afr_symlink_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, loc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - - op_ret = 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + local->cont.symlink.linkpath = gf_strdup(linkpath); + local->umask = umask; + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_SYMLINK; + local->transaction.wind = afr_symlink_wind; + local->transaction.unwind = afr_symlink_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (symlink, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); - } - - return 0; + if 
(transaction_frame) + AFR_STACK_DESTROY(transaction_frame); + + AFR_STACK_UNWIND(symlink, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); + return 0; } /* }}} */ @@ -1351,230 +939,118 @@ out: /* {{{ rename */ int -afr_rename_unwind (call_frame_t *frame, xlator_t *this) +afr_rename_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - struct iatt *unwind_buf = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.rename.read_child_buf.ia_ino) { - unwind_buf = &local->cont.rename.read_child_buf; - } else { - unwind_buf = &local->cont.rename.buf; - } - - unwind_buf->ia_ino = local->cont.rename.ino; - - local->cont.rename.preoldparent.ia_ino = local->cont.rename.oldparent_ino; - local->cont.rename.postoldparent.ia_ino = local->cont.rename.oldparent_ino; - local->cont.rename.prenewparent.ia_ino = local->cont.rename.newparent_ino; - local->cont.rename.postnewparent.ia_ino = local->cont.rename.newparent_ino; - - AFR_STACK_UNWIND (rename, main_frame, - local->op_ret, local->op_errno, - unwind_buf, - &local->cont.rename.preoldparent, - &local->cont.rename.postoldparent, - &local->cont.rename.prenewparent, - &local->cont.rename.postnewparent); - } - - return 0; -} + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + local = frame->local; -int -afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) -{ - afr_local_t * local = NULL; - - int call_count = -1; - int child_index = -1; - - local = frame->local; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno) && op_errno 
!= ENOTEMPTY) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - - if (buf) { - local->cont.rename.buf = *buf; - } - - local->success_count++; - } - - if (child_index == local->read_child_index) { - local->cont.rename.read_child_buf = *buf; - - local->cont.rename.preoldparent = *preoldparent; - local->cont.rename.postoldparent = *postoldparent; - local->cont.rename.prenewparent = *prenewparent; - local->cont.rename.postnewparent = *postnewparent; - } - } + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - local->transaction.resume (frame, this); - } - - return 0; + AFR_STACK_UNWIND(rename, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.buf, &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + &local->cont.dir_fop.prenewparent, + &local->cont.dir_fop.postnewparent, local->xdata_rsp); + return 0; } - -int32_t -afr_rename_wind (call_frame_t *frame, xlator_t *this) +int +afr_rename_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, - (void *) (long) i, - priv->children[i], - 
priv->children[i]->fops->rename, - &local->loc, - &local->newloc); - if (!--call_count) - break; - } - } - - return 0; + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preoldparent, postoldparent, prenewparent, + postnewparent, xdata); } - int -afr_rename_done (call_frame_t *frame, xlator_t *this) +afr_rename_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = frame->local; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local->transaction.unwind (frame, this); + local = frame->local; + priv = this->private; - AFR_STACK_DESTROY (frame); - - return 0; + STACK_WIND_COOKIE(frame, afr_rename_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->rename, &local->loc, + &local->newloc, local->xdata_req); + return 0; } - int -afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) +afr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; - - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - transaction_frame->local = local; - - loc_copy (&local->loc, oldloc); - loc_copy (&local->newloc, newloc); - - local->read_child_index = afr_read_child (this, oldloc->inode); - - local->cont.rename.ino = oldloc->inode->ino; - - if (oldloc->parent) - local->cont.rename.oldparent_ino = oldloc->parent->ino; - if (newloc->parent) - local->cont.rename.newparent_ino = newloc->parent->ino; - - 
local->transaction.fop = afr_rename_wind; - local->transaction.done = afr_rename_done; - local->transaction.unwind = afr_rename_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, oldloc); - afr_build_parent_loc (&local->transaction.new_parent_loc, newloc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (oldloc->path); - local->transaction.new_basename = AFR_BASENAME (newloc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); - - op_ret = 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) { + op_errno = ENOMEM; + goto out; + } + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, oldloc); + loc_copy(&local->newloc, newloc); + + local->inode = inode_ref(oldloc->inode); + local->parent = inode_ref(oldloc->parent); + local->parent2 = inode_ref(newloc->parent); + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_RENAME; + local->transaction.wind = afr_rename_wind; + local->transaction.unwind = afr_rename_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, oldloc, + &op_errno); + if (ret) + goto out; + ret = afr_build_parent_loc(&local->transaction.new_parent_loc, newloc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(oldloc->path); + local->transaction.new_basename = AFR_BASENAME(newloc->path); + ret = afr_transaction(transaction_frame, this, + AFR_ENTRY_RENAME_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (rename, frame, op_ret, op_errno, - 
NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; } /* }}} */ @@ -1582,412 +1058,205 @@ out: /* {{{ unlink */ int -afr_unlink_unwind (call_frame_t *frame, xlator_t *this) -{ - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - local->cont.unlink.preparent.ia_ino = local->cont.unlink.parent_ino; - local->cont.unlink.postparent.ia_ino = local->cont.unlink.parent_ino; - - AFR_STACK_UNWIND (unlink, main_frame, - local->op_ret, local->op_errno, - &local->cont.unlink.preparent, - &local->cont.unlink.postparent); - } - - return 0; -} - - -int -afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) +afr_unlink_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (child_index == local->read_child_index) { - local->read_child_returned = _gf_true; - } - - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.unlink.preparent = *preparent; - local->cont.unlink.postparent = *postparent; - } + local = frame->local; - if (child_index == local->read_child_index) { - local->cont.unlink.preparent = *preparent; - local->cont.unlink.postparent = *postparent; - } + main_frame = 
afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + AFR_STACK_UNWIND(unlink, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } - -int32_t -afr_unlink_wind (call_frame_t *frame, xlator_t *this) +int +afr_unlink_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->unlink, - &local->loc); - - if (!--call_count) - break; - } - } - - return 0; + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); } - -int32_t -afr_unlink_done (call_frame_t *frame, xlator_t *this) +int +afr_unlink_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = frame->local; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local->transaction.unwind (frame, this); + local = frame->local; + priv = this->private; - AFR_STACK_DESTROY (frame); - - return 0; + STACK_WIND_COOKIE(frame, afr_unlink_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + 
priv->children[subvol]->fops->unlink, &local->loc, + local->xflag, local->xdata_req); + return 0; } - -int32_t -afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc) +int +afr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; - - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - transaction_frame->local = local; - - loc_copy (&local->loc, loc); - - if (loc->parent) - local->cont.unlink.parent_ino = loc->parent->ino; - - local->transaction.fop = afr_unlink_wind; - local->transaction.done = afr_unlink_done; - local->transaction.unwind = afr_unlink_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, loc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - - op_ret = 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + local->xflag = xflag; + + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_UNLINK; + 
local->transaction.wind = afr_unlink_wind; + local->transaction.unwind = afr_unlink_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (unlink, frame, op_ret, op_errno, - NULL, NULL); - } - - return 0; + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); + + AFR_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } /* }}} */ /* {{{ rmdir */ - - int -afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) +afr_rmdir_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - local->cont.rmdir.preparent.ia_ino = local->cont.rmdir.parent_ino; - local->cont.rmdir.postparent.ia_ino = local->cont.rmdir.parent_ino; - - AFR_STACK_UNWIND (rmdir, main_frame, - local->op_ret, local->op_errno, - &local->cont.rmdir.preparent, - &local->cont.rmdir.postparent); - } - - return 0; -} - - -int -afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = (long) cookie; - int read_child = 0; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - local = frame->local; - priv = this->private; + local = frame->local; - LOCK (&frame->lock); - { - if 
(child_index == read_child) { - local->read_child_returned = _gf_true; - } + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; - if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.rmdir.preparent = *preparent; - local->cont.rmdir.postparent = *postparent; - - } - - if (child_index == read_child) { - local->cont.rmdir.preparent = *preparent; - local->cont.rmdir.postparent = *postparent; - } - - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - local->transaction.resume (frame, this); - } - - return 0; + AFR_STACK_UNWIND(rmdir, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; } - int -afr_rmdir_wind (call_frame_t *frame, xlator_t *this) +afr_rmdir_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rmdir, - &local->loc, local->cont.rmdir.flags); - - if (!--call_count) - break; - } - } - - return 0; + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + preparent, 
postparent, NULL, NULL, xdata); } - int -afr_rmdir_done (call_frame_t *frame, xlator_t *this) +afr_rmdir_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = frame->local; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local->transaction.unwind (frame, this); + local = frame->local; + priv = this->private; - AFR_STACK_DESTROY (frame); - - return 0; + STACK_WIND_COOKIE(frame, afr_rmdir_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->rmdir, &local->loc, + local->cont.rmdir.flags, local->xdata_req); + return 0; } - int -afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags) +afr_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; - - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - transaction_frame->local = local; - - local->cont.rmdir.flags = flags; - loc_copy (&local->loc, loc); - - if (loc->parent) - local->cont.rmdir.parent_ino = loc->parent->ino; - - local->transaction.fop = afr_rmdir_wind; - local->transaction.done = afr_rmdir_done; - local->transaction.unwind = afr_rmdir_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, loc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - - op_ret = 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = 
-1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + local->cont.rmdir.flags = flags; + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_RMDIR; + local->transaction.wind = afr_rmdir_wind; + local->transaction.unwind = afr_rmdir_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (rmdir, frame, op_ret, op_errno, - NULL, NULL); - } - - return 0; + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); + + AFR_STACK_UNWIND(rmdir, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } /* }}} */ - diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h index e589efa3794..1d88c3b9b26 100644 --- a/xlators/cluster/afr/src/afr-dir-write.h +++ b/xlators/cluster/afr/src/afr-dir-write.h @@ -1,60 +1,46 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __DIR_WRITE_H__ #define __DIR_WRITE_H__ int32_t -afr_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params); +afr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata); int32_t -afr_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev, dict_t *params); +afr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *xdata); int32_t -afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params); +afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata); int32_t -afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc); +afr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata); int32_t -afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags); +afr_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata); int32_t -afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); +afr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t 
*newloc, + dict_t *xdata); int32_t -afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); +afr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); int -afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *oldloc, dict_t *params); - -int32_t -afr_setdents (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count); +afr_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *oldloc, mode_t umask, dict_t *params); #endif /* __DIR_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index d2089db8ed4..c5521704de2 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ #include <libgen.h> #include <unistd.h> @@ -25,931 +15,1880 @@ #include <stdlib.h> #include <signal.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" - +#include <glusterfs/glusterfs.h> #include "afr.h" +#include <glusterfs/dict.h> +#include <glusterfs/logging.h> +#include <glusterfs/list.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/defaults.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include <glusterfs/quota-common-utils.h> + +#include "afr-transaction.h" +#include "afr-messages.h" - -/** - * Common algorithm for inode read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - * try the next child - * - * Applicable to: access, stat, fstat, readlink, getxattr - */ +/* + * Quota size xattrs are not maintained by afr. There is a + * possibility that they differ even when both the directory changelog xattrs + * suggest everything is fine. So if there is at least one 'source' check among + * the sources which has the maximum quota size. Otherwise check among all the + * available ones for maximum quota size. This way if there is a source and + * stale copies it always votes for the 'source'. 
+ * */ + +int +afr_handle_quota_size(call_frame_t *frame, xlator_t *this) +{ + unsigned char *readable = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int i = 0; + int ret = 0; + quota_meta_t size = { + 0, + }; + quota_meta_t max_size = { + 0, + }; + int readable_cnt = 0; + int read_subvol = -1; + + local = frame->local; + priv = this->private; + replies = local->replies; + + readable = alloca0(priv->child_count); + + afr_inode_read_subvol_get(local->inode, this, readable, 0, 0); + + readable_cnt = AFR_COUNT(readable, priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + ret = quota_dict_get_meta(replies[i].xdata, QUOTA_SIZE_KEY, + SLEN(QUOTA_SIZE_KEY), &size); + if (ret == -1) + continue; + if (read_subvol == -1) + read_subvol = i; + if (size.size > max_size.size || + (size.file_count + size.dir_count) > + (max_size.file_count + max_size.dir_count)) + read_subvol = i; + + if (size.size > max_size.size) + max_size.size = size.size; + if (size.file_count > max_size.file_count) + max_size.file_count = size.file_count; + if (size.dir_count > max_size.dir_count) + max_size.dir_count = size.dir_count; + } + + if (max_size.size == 0 && max_size.file_count == 0 && + max_size.dir_count == 0) + return read_subvol; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + quota_dict_set_meta(replies[i].xdata, QUOTA_SIZE_KEY, &max_size, + IA_IFDIR); + } + + return read_subvol; +} /* {{{ access */ -int32_t -afr_access_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +int +afr_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xdata) { - 
afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_local_t *local = NULL; - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; + local = frame->local; - priv = this->private; - children = priv->children; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - local = frame->local; + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - read_child = (long) cookie; + AFR_STACK_UNWIND(access, frame, op_ret, op_errno, xdata); - if (op_ret == -1) { - retry: - last_tried = local->cont.access.last_tried; + return 0; +} - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.access.last_tried; +int +afr_access_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - if (this_try == read_child) { - goto retry; - } + priv = this->private; + local = frame->local; - unwind = 0; + if (subvol == -1) { + AFR_STACK_UNWIND(access, frame, local->op_ret, local->op_errno, 0); + return 0; + } - STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->access, - &local->loc, local->cont.access.mask); - } + STACK_WIND_COOKIE(frame, afr_access_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->access, &local->loc, + local->cont.access.mask, local->xdata_req); + return 0; +} +int +afr_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int mask, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int op_errno = 0; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_ACCESS; + loc_copy(&local->loc, loc); + local->cont.access.mask = mask; + if (xdata) + local->xdata_req = dict_ref(xdata); + + afr_read_txn(frame, this, loc->inode, afr_access_wind, + AFR_METADATA_TRANSACTION); + + return 0; out: - if (unwind) { - AFR_STACK_UNWIND (access, frame, 
op_ret, op_errno); - } + AFR_STACK_UNWIND(access, frame, -1, op_errno, NULL); - return 0; + return 0; } +/* }}} */ -int32_t -afr_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask) -{ - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; +/* {{{ stat */ - int32_t read_child = -1; +int +afr_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) +{ + afr_local_t *local = NULL; + local = frame->local; - int32_t op_ret = -1; - int32_t op_errno = 0; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + AFR_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata); - children = priv->children; + return 0; +} - ALLOC_OR_GOTO (local, afr_local_t, out); +int +afr_stat_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - read_child = afr_read_child (this, loc->inode); + priv = this->private; + local = frame->local; - if ((read_child >= 0) && (priv->child_up[read_child])) { - call_child = read_child; + if (subvol == -1) { + AFR_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno, 0, 0); + return 0; + } - local->cont.access.last_tried = -1; + STACK_WIND_COOKIE( + frame, afr_stat_cbk, (void *)(long)subvol, priv->children[subvol], + priv->children[subvol]->fops->stat, &local->loc, local->xdata_req); + return 0; +} - } else { - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } +int +afr_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + afr_local_t *local = NULL; + int 
op_errno = 0; - local->cont.access.last_tried = call_child; - } + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - loc_copy (&local->loc, loc); - local->cont.access.mask = mask; + local->op = GF_FOP_STAT; + loc_copy(&local->loc, loc); + if (xdata) + local->xdata_req = dict_ref(xdata); - STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) call_child, - children[call_child], children[call_child]->fops->access, - loc, mask); + afr_read_txn(frame, this, loc->inode, afr_stat_wind, AFR_DATA_TRANSACTION); - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (access, frame, op_ret, op_errno); - } - return 0; -} + AFR_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL); + return 0; +} /* }}} */ -/* {{{ stat */ +/* {{{ fstat */ -int32_t -afr_stat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf) +int +afr_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; - read_child = (long) cookie; + local = frame->local; - local = frame->local; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - if (op_ret == -1) { - retry: - last_tried = local->cont.stat.last_tried; - - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.stat.last_tried; + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - if (this_try == read_child) { - goto retry; - } + AFR_STACK_UNWIND(fstat, frame, op_ret, op_errno, buf, xdata); - unwind = 0; + return 0; +} - STACK_WIND_COOKIE (frame, afr_stat_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->stat, - 
&local->loc); - } +int +afr_fstat_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; -out: - if (unwind) { - if (buf) - buf->ia_ino = local->cont.stat.ino; + priv = this->private; + local = frame->local; - AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf); - } + if (subvol == -1) { + AFR_STACK_UNWIND(fstat, frame, local->op_ret, local->op_errno, 0, 0); + return 0; + } - return 0; + STACK_WIND_COOKIE( + frame, afr_fstat_cbk, (void *)(long)subvol, priv->children[subvol], + priv->children[subvol]->fops->fstat, local->fd, local->xdata_req); + return 0; } - int32_t -afr_stat (call_frame_t *frame, xlator_t *this, - loc_t *loc) +afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_local_t *local = NULL; + int op_errno = 0; - int32_t read_child = -1; - int call_child = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - int32_t op_ret = -1; - int32_t op_errno = 0; + local->op = GF_FOP_FSTAT; + local->fd = fd_ref(fd); + if (xdata) + local->xdata_req = dict_ref(xdata); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_fix_open(fd, this); - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + afr_read_txn(frame, this, fd->inode, afr_fstat_wind, AFR_DATA_TRANSACTION); - children = priv->children; + return 0; +out: + AFR_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL); - ALLOC_OR_GOTO (local, afr_local_t, out); + return 0; +} - frame->local = local; +/* }}} */ - read_child = afr_read_child (this, loc->inode); +/* {{{ readlink */ - if ((read_child >= 0) && (priv->child_up[read_child])) { - call_child = read_child; +int +afr_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, const char 
*buf, + struct iatt *sbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; - local->cont.stat.last_tried = -1; + local = frame->local; - } else { - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - local->cont.stat.last_tried = call_child; - } + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - loc_copy (&local->loc, loc); + AFR_STACK_UNWIND(readlink, frame, op_ret, op_errno, buf, sbuf, xdata); + return 0; +} - local->cont.stat.ino = loc->inode->ino; +int +afr_readlink_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->stat, - loc); + local = frame->local; + priv = this->private; - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, NULL); - } + if (subvol == -1) { + AFR_STACK_UNWIND(readlink, frame, local->op_ret, local->op_errno, 0, 0, + 0); + return 0; + } - return 0; + STACK_WIND_COOKIE(frame, afr_readlink_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->readlink, &local->loc, + local->cont.readlink.size, local->xdata_req); + return 0; } +int +afr_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; -/* }}} */ + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; -/* {{{ fstat */ + local->op = GF_FOP_READLINK; + loc_copy(&local->loc, loc); + local->cont.readlink.size = size; + if (xdata) + local->xdata_req = dict_ref(xdata); -int32_t -afr_fstat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf) -{ - afr_private_t * priv = NULL; - 
afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_read_txn(frame, this, loc->inode, afr_readlink_wind, + AFR_DATA_TRANSACTION); + + return 0; +out: + AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0); + + return 0; +} - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; +/* }}} */ - priv = this->private; - children = priv->children; +/* {{{ getxattr */ - local = frame->local; +struct _xattr_key { + char *key; + struct list_head list; +}; - read_child = (long) cookie; +int +__gather_xattr_keys(dict_t *dict, char *key, data_t *value, void *data) +{ + struct list_head *list = data; + struct _xattr_key *xkey = NULL; + + if (!strncmp(key, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) { + xkey = GF_MALLOC(sizeof(*xkey), gf_afr_mt_xattr_key); + if (!xkey) + return -1; - if (op_ret == -1) { - retry: - last_tried = local->cont.fstat.last_tried; + xkey->key = key; + INIT_LIST_HEAD(&xkey->list); - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.fstat.last_tried; + list_add_tail(&xkey->list, list); + } + return 0; +} - if (this_try == read_child) { - goto retry; - } +void +afr_filter_xattrs(dict_t *dict) +{ + struct list_head keys = { + 0, + }; + struct _xattr_key *key = NULL; + struct _xattr_key *tmp = NULL; - unwind = 0; + INIT_LIST_HEAD(&keys); - STACK_WIND_COOKIE (frame, afr_fstat_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->fstat, - local->fd); - } + dict_foreach(dict, __gather_xattr_keys, (void *)&keys); -out: - if (unwind) { - if (buf) - buf->ia_ino = local->cont.fstat.ino; + list_for_each_entry_safe(key, tmp, &keys, list) + { + dict_del(dict, key->key); - AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf); - } + list_del_init(&key->list); - return 0; + GF_FREE(key); + } } +static gf_boolean_t +afr_getxattr_ignorable_errnos(int32_t op_errno) +{ + if (op_errno == ENODATA || op_errno == ENOTSUP || op_errno == ERANGE || + op_errno == 
ENAMETOOLONG) + return _gf_true; -int32_t -afr_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd) + return _gf_false; +} +int +afr_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_local_t *local = NULL; - int call_child = 0; - int32_t read_child = -1; + local = frame->local; - int32_t op_ret = -1; - int32_t op_errno = 0; + if (op_ret < 0 && !afr_getxattr_ignorable_errnos(op_errno)) { + local->op_ret = op_ret; + local->op_errno = op_errno; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (this->private, out); + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + if (dict) + afr_filter_xattrs(dict); - children = priv->children; + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); - ALLOC_OR_GOTO (local, afr_local_t, out); + return 0; +} - frame->local = local; +int +afr_getxattr_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - VALIDATE_OR_GOTO (fd->inode, out); + local = frame->local; + priv = this->private; - read_child = afr_read_child (this, fd->inode); + if (subvol == -1) { + AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, NULL, + NULL); + return 0; + } - if ((read_child >= 0) && (priv->child_up[read_child])) { - call_child = read_child; + STACK_WIND_COOKIE(frame, afr_getxattr_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->getxattr, &local->loc, + local->cont.getxattr.name, local->xdata_req); + return 0; +} - local->cont.fstat.last_tried = -1; - } else { - call_child = afr_first_up_child (priv); +int32_t +afr_getxattr_unwind(call_frame_t *frame, int op_ret, int op_errno, dict_t *dict, + dict_t 
*xdata) - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } +{ + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} - local->cont.fstat.last_tried = call_child; +int32_t +afr_fgetxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + dict_t *xattr = NULL; + char *tmp_report = NULL; + char lk_summary[1024] = { + 0, + }; + int serz_len = 0; + int32_t callcnt = 0; + long int cky = 0; + int ret = 0; + int keylen = 0; + int children_keylen = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + cky = (long)cookie; + keylen = strlen(local->cont.getxattr.name); + children_keylen = strlen(children[cky]->name); + + LOCK(&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) + local->replies[cky].op_errno = op_errno; + + if (!local->dict) + local->dict = dict_new(); + if (local->dict) { + ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, + &tmp_report); + if (ret) + goto unlock; + ret = dict_set_dynstrn(local->dict, children[cky]->name, + children_keylen, gf_strdup(tmp_report)); + if (ret) + goto unlock; + } + } +unlock: + UNLOCK(&frame->lock); + + if (!callcnt) { + xattr = dict_new(); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + ret = dict_serialize_value_with_delim(local->dict, lk_summary, + &serz_len, '\n'); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + if (serz_len == -1) + snprintf(lk_summary, sizeof(lk_summary), "No locks cleared."); + ret = dict_set_dynstrn(xattr, local->cont.getxattr.name, keylen, + gf_strdup(lk_summary)); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_SET_FAILED, + "Error setting dictionary"); + goto unwind; } - 
local->cont.fstat.ino = fd->inode->ino; - local->fd = fd_ref (fd); - - STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->fstat, - fd); + op_errno = afr_final_errno(local, priv); - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, NULL); - } + unwind: + AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, xattr, xdata); + if (xattr) + dict_unref(xattr); + } - return 0; + return ret; } -/* }}} */ - -/* {{{ readlink */ - int32_t -afr_readlink_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - const char *buf, struct iatt *sbuf) +afr_getxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + dict_t *xattr = NULL; + char *tmp_report = NULL; + char lk_summary[1024] = { + 0, + }; + int serz_len = 0; + int32_t callcnt = 0; + long int cky = 0; + int ret = 0; + int keylen = 0; + int children_keylen = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + cky = (long)cookie; + + keylen = strlen(local->cont.getxattr.name); + children_keylen = strlen(children[cky]->name); + + LOCK(&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) + local->replies[cky].op_errno = op_errno; + + if (!local->dict) + local->dict = dict_new(); + if (local->dict) { + ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, + &tmp_report); + if (ret) + goto unlock; + ret = dict_set_dynstrn(local->dict, children[cky]->name, + children_keylen, gf_strdup(tmp_report)); + if (ret) + goto unlock; + } + } +unlock: + UNLOCK(&frame->lock); + + if (!callcnt) { + xattr = dict_new(); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto 
unwind; + } + ret = dict_serialize_value_with_delim(local->dict, lk_summary, + &serz_len, '\n'); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + if (serz_len == -1) + snprintf(lk_summary, sizeof(lk_summary), "No locks cleared."); + ret = dict_set_dynstrn(xattr, local->cont.getxattr.name, keylen, + gf_strdup(lk_summary)); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_SET_FAILED, + "Error setting dictionary"); + goto unwind; + } - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; + op_errno = afr_final_errno(local, priv); - priv = this->private; - children = priv->children; + unwind: + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata); - local = frame->local; + if (xattr) + dict_unref(xattr); + } - read_child = (long) cookie; + return ret; +} - if (op_ret == -1) { - retry: - last_tried = local->cont.readlink.last_tried; +/** + * node-uuid cbk uses next child querying mechanism + */ +int32_t +afr_getxattr_node_uuid_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int unwind = 1; + int curr_call_child = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { /** query the _next_ child */ + + /** + * _current_ becomes _next_ + * If done with all children and yet no success; give up ! 
+ */ + curr_call_child = (int)((long)cookie); + if (++curr_call_child == priv->child_count) + goto unwind; + + gf_msg_debug(this->name, op_errno, + "op_ret (-1): Re-querying afr-child (%d/%d)", + curr_call_child, priv->child_count); + + unwind = 0; + STACK_WIND_COOKIE( + frame, afr_getxattr_node_uuid_cbk, (void *)(long)curr_call_child, + children[curr_call_child], + children[curr_call_child]->fops->getxattr, &local->loc, + local->cont.getxattr.name, local->xdata_req); + } + +unwind: + if (unwind) + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); + + return 0; +} - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.readlink.last_tried; +/** + * list-node-uuids cbk returns the list of node_uuids for the subvolume. + */ +int32_t +afr_getxattr_list_node_uuids_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr_serz = NULL; + long cky = 0; + int32_t tlen = 0; + + local = frame->local; + priv = this->private; + cky = (long)cookie; + + LOCK(&frame->lock); + { + callcnt = --local->call_count; + local->replies[cky].valid = 1; + local->replies[cky].op_ret = op_ret; + local->replies[cky].op_errno = op_errno; + + if (op_ret < 0) + goto unlock; + + local->op_ret = 0; + + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); + local->replies[cky].xattr = dict_ref(dict); + } + +unlock: + UNLOCK(&frame->lock); + + if (!callcnt) { + if (local->op_ret != 0) { + /* All bricks gave an error. 
*/ + local->op_errno = afr_final_errno(local, priv); + goto unwind; + } - if (this_try == read_child) { - goto retry; - } + /*Since we store the UUID0_STR as node uuid for down bricks and + *for non zero op_ret, assigning length to priv->child_count + *number of uuids*/ + local->cont.getxattr.xattr_len = (SLEN(UUID0_STR) + 2) * + priv->child_count; + + if (!local->dict) + local->dict = dict_new(); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } - unwind = 0; - STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->readlink, - &local->loc, - local->cont.readlink.size); - } + xattr_serz = GF_CALLOC(local->cont.getxattr.xattr_len, sizeof(char), + gf_common_mt_char); -out: - if (unwind) { - if (sbuf) - sbuf->ia_ino = local->cont.readlink.ino; + if (!xattr_serz) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } - AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf); - } + ret = afr_serialize_xattrs_with_delimiter(frame, this, xattr_serz, + UUID0_STR, &tlen, ' '); + if (ret) { + local->op_ret = -1; + local->op_errno = ENOMEM; + GF_FREE(xattr_serz); + goto unwind; + } + ret = dict_set_dynstr_sizen(local->dict, GF_XATTR_LIST_NODE_UUIDS_KEY, + xattr_serz); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Cannot set node_uuid key in dict"); + local->op_ret = -1; + local->op_errno = ENOMEM; + if (ret == -EINVAL) + GF_FREE(xattr_serz); + } else { + local->op_ret = local->cont.getxattr.xattr_len - 1; + local->op_errno = 0; + } - return 0; + unwind: + AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, + local->dict, local->xdata_rsp); + } + + return ret; } +int32_t +afr_getxattr_quota_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + int idx = (long)cookie; + int call_count = 0; + afr_local_t *local = 
frame->local; + int read_subvol = -1; + + local->replies[idx].valid = 1; + local->replies[idx].op_ret = op_ret; + local->replies[idx].op_errno = op_errno; + if (dict) + local->replies[idx].xdata = dict_ref(dict); + call_count = afr_frame_return(frame); + if (call_count == 0) { + local->inode = inode_ref(local->loc.inode); + read_subvol = afr_handle_quota_size(frame, this); + if (read_subvol != -1) { + op_ret = local->replies[read_subvol].op_ret; + op_errno = local->replies[read_subvol].op_errno; + dict = local->replies[read_subvol].xdata; + } + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); + } + + return 0; +} int32_t -afr_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size) +afr_getxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; + int call_cnt = 0, len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t *local = NULL; - int32_t read_child = -1; + LOCK(&frame->lock); + { + local = frame->local; - int32_t op_ret = -1; - int32_t op_errno = 0; + call_cnt = --local->call_count; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + if ((op_ret < 0) || (!dict && !xdata)) { + goto unlock; + } - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + if (xdata) { + if (!local->xdata_rsp) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } - children = priv->children; + if (!dict) { + goto unlock; + } - ALLOC_OR_GOTO (local, afr_local_t, out); + op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); - frame->local = local; + if (!lockinfo_buf) { + goto unlock; + } - read_child = afr_read_child (this, 
loc->inode); + if (!local->dict) { + local->dict = dict_new(); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } +unlock: + UNLOCK(&frame->lock); + + if (lockinfo_buf != NULL) { + lockinfo = dict_new(); + if (lockinfo == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } else { + op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo); - if ((read_child >= 0) && (priv->child_up[read_child])) { - call_child = read_child; + if (lockinfo && local->dict) { + dict_copy(lockinfo, local->dict); + } + } + } + + if (xdata && local->xdata_rsp) { + dict_copy(xdata, local->xdata_rsp); + } + + if (!call_cnt) { + newdict = dict_new(); + if (!newdict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } - local->cont.readlink.last_tried = -1; + op_ret = dict_allocate_and_serialize( + local->dict, (char **)&lockinfo_buf, (unsigned int *)&len); + if (op_ret != 0) { + local->op_ret = -1; + goto unwind; + } - } else { - call_child = afr_first_up_child (priv); + op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto unwind; + } + + unwind: + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, newdict, + local->xdata_rsp); + } + + dict_unref(lockinfo); + + return 0; +} + +int32_t +afr_fgetxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + int call_cnt = 0, len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t *local = NULL; + + LOCK(&frame->lock); + { + local = frame->local; - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; + call_cnt = --local->call_count; + + if ((op_ret < 0) || (!dict && !xdata)) { + goto unlock; + } + + if (xdata) { + if (!local->xdata_rsp) { + local->xdata_rsp = dict_new(); + if 
(!local->xdata_rsp) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; } + } + } - local->cont.readlink.last_tried = call_child; + if (!dict) { + goto unlock; } - loc_copy (&local->loc, loc); + op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); - local->cont.readlink.size = size; - local->cont.readlink.ino = loc->inode->ino; + if (!lockinfo_buf) { + goto unlock; + } - STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) (long) call_child, - children[call_child], children[call_child]->fops->readlink, - loc, size); + if (!local->dict) { + local->dict = dict_new(); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } +unlock: + UNLOCK(&frame->lock); + + if (lockinfo_buf != NULL) { + lockinfo = dict_new(); + if (lockinfo == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } else { + op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo); - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, NULL, NULL); - } - return 0; -} + if (lockinfo && local->dict) { + dict_copy(lockinfo, local->dict); + } + } + } + + if (xdata && local->xdata_rsp) { + dict_copy(xdata, local->xdata_rsp); + } + + if (!call_cnt) { + newdict = dict_new(); + if (!newdict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + op_ret = dict_allocate_and_serialize( + local->dict, (char **)&lockinfo_buf, (unsigned int *)&len); + if (op_ret != 0) { + local->op_ret = -1; + goto unwind; + } -/* }}} */ + op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto unwind; + } -/* {{{ getxattr */ + unwind: + AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, newdict, + local->xdata_rsp); + } -struct _xattr_key { - char *key; - struct list_head list; -}; + dict_unref(lockinfo); + return 0; +} -void -__gather_xattr_keys (dict_t 
*dict, char *key, data_t *value, - void *data) +int32_t +afr_fgetxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - struct list_head * list = data; - struct _xattr_key * xkey = NULL; + afr_local_t *local = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr = NULL; + char *xattr_serz = NULL; + int keylen = 0; + char xattr_cky[1024] = { + 0, + }; + int xattr_cky_len = 0; + dict_t *nxattr = NULL; + long cky = 0; + int32_t padding = 0; + int32_t tlen = 0; + + if (!frame || !frame->local || !this) { + gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref"); + goto out; + } + + local = frame->local; + cky = (long)cookie; + keylen = strlen(local->cont.getxattr.name); + xattr_cky_len = snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld", + local->cont.getxattr.name, cky); + LOCK(&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret < 0) { + local->op_errno = op_errno; + } else { + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); + } - if (!strncmp (key, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { + if (!dict || (op_ret < 0)) + goto unlock; - xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); - if (!xkey) - return; + if (!local->dict) { + local->dict = dict_new(); + if (!local->dict) + goto unlock; + } + ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, &xattr); + if (ret) + goto unlock; + + xattr = gf_strdup(xattr); + + ret = dict_set_dynstrn(local->dict, xattr_cky, xattr_cky_len, xattr); + if (ret) { + UNLOCK(&frame->lock); + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Cannot set xattr cookie key"); + goto post_unlock; + } - xkey->key = key; - INIT_LIST_HEAD (&xkey->list); + local->cont.getxattr.xattr_len += strlen(xattr) + 1; + } +unlock: + UNLOCK(&frame->lock); +post_unlock: + if (!callcnt) { + if (!local->cont.getxattr.xattr_len) + goto unwind; + + nxattr 
= dict_new(); + if (!nxattr) + goto unwind; + + /* extra bytes for decorations (brackets and <>'s) */ + padding += strlen(this->name) + SLEN(AFR_PATHINFO_HEADER) + 4; + local->cont.getxattr.xattr_len += (padding + 2); + + xattr_serz = GF_MALLOC(local->cont.getxattr.xattr_len, + gf_common_mt_char); + + if (!xattr_serz) + goto unwind; + + /* the xlator info */ + int xattr_serz_len = sprintf( + xattr_serz, "(<" AFR_PATHINFO_HEADER "%s> ", this->name); + + /* actual series of pathinfo */ + ret = dict_serialize_value_with_delim( + local->dict, xattr_serz + xattr_serz_len, &tlen, ' '); + if (ret) { + GF_FREE(xattr_serz); + goto unwind; + } - list_add_tail (&xkey->list, list); + /* closing part */ + *(xattr_serz + padding + tlen) = ')'; + *(xattr_serz + padding + tlen + 1) = '\0'; + + ret = dict_set_dynstrn(nxattr, local->cont.getxattr.name, keylen, + xattr_serz); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Cannot set pathinfo key in dict"); + if (ret == -EINVAL) + GF_FREE(xattr_serz); } -} + unwind: + AFR_STACK_UNWIND(fgetxattr, frame, local->op_ret, local->op_errno, + nxattr, local->xdata_rsp); -void -__filter_xattrs (dict_t *dict) -{ - struct list_head keys; + if (nxattr) + dict_unref(nxattr); + } - struct _xattr_key *key; - struct _xattr_key *tmp; +out: + return ret; +} - INIT_LIST_HEAD (&keys); +int32_t +afr_getxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr = NULL; + char *xattr_serz = NULL; + char xattr_cky[1024] = { + 0, + }; + int keylen = 0; + int xattr_cky_len = 0; + dict_t *nxattr = NULL; + long cky = 0; + int32_t padding = 0; + int32_t tlen = 0; + + if (!frame || !frame->local || !this) { + gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref"); + goto out; + } + + local = frame->local; + cky = (long)cookie; + keylen = 
strlen(local->cont.getxattr.name); + xattr_cky_len = snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld", + local->cont.getxattr.name, cky); + LOCK(&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret < 0) { + local->op_errno = op_errno; + } else { + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); + } - dict_foreach (dict, __gather_xattr_keys, - (void *) &keys); + if (!dict || (op_ret < 0)) + goto unlock; - list_for_each_entry_safe (key, tmp, &keys, list) { - dict_del (dict, key->key); + if (!local->dict) { + local->dict = dict_new(); + if (!local->dict) + goto unlock; + } + ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, &xattr); + if (ret) + goto unlock; + + xattr = gf_strdup(xattr); + + ret = dict_set_dynstrn(local->dict, xattr_cky, xattr_cky_len, xattr); + if (ret) { + UNLOCK(&frame->lock); + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Cannot set xattr cookie key"); + goto post_unlock; + } - list_del_init (&key->list); + local->cont.getxattr.xattr_len += strlen(xattr) + 1; + } +unlock: + UNLOCK(&frame->lock); +post_unlock: + if (!callcnt) { + if (!local->cont.getxattr.xattr_len) + goto unwind; + + nxattr = dict_new(); + if (!nxattr) + goto unwind; + + /* extra bytes for decorations (brackets and <>'s) */ + padding += strlen(this->name) + SLEN(AFR_PATHINFO_HEADER) + 4; + local->cont.getxattr.xattr_len += (padding + 2); + + xattr_serz = GF_MALLOC(local->cont.getxattr.xattr_len, + gf_common_mt_char); + + if (!xattr_serz) + goto unwind; + + /* the xlator info */ + int xattr_serz_len = sprintf( + xattr_serz, "(<" AFR_PATHINFO_HEADER "%s> ", this->name); + + /* actual series of pathinfo */ + ret = dict_serialize_value_with_delim( + local->dict, xattr_serz + xattr_serz_len, &tlen, ' '); + if (ret) { + GF_FREE(xattr_serz); + goto unwind; + } - GF_FREE (key); + /* closing part */ + *(xattr_serz + padding + tlen) = ')'; + *(xattr_serz + padding + tlen + 1) = '\0'; + + ret = 
dict_set_dynstrn(nxattr, local->cont.getxattr.name, keylen, + xattr_serz); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Cannot set pathinfo key in dict"); + if (ret == -EINVAL) + GF_FREE(xattr_serz); } + + unwind: + AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, + nxattr, local->xdata_rsp); + + if (nxattr) + dict_unref(nxattr); + } + +out: + return ret; } +static int +afr_aggregate_stime_xattr(dict_t *this, char *key, data_t *value, void *data) +{ + int ret = 0; + + if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) + ret = gf_get_max_stime(THIS, data, key, value); + return ret; +} int32_t -afr_getxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) +afr_common_getxattr_stime_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_local_t *local = NULL; + int32_t callcnt = 0; - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; + if (!frame || !frame->local || !this) { + gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref"); + goto out; + } - priv = this->private; - children = priv->children; + local = frame->local; - local = frame->local; + LOCK(&frame->lock); + { + callcnt = --local->call_count; - read_child = (long) cookie; + if (!dict || (op_ret < 0)) { + local->op_errno = op_errno; + goto cleanup; + } - if (op_ret == -1) { - retry: - last_tried = local->cont.getxattr.last_tried; + if (!local->dict) + local->dict = dict_copy_with_ref(dict, NULL); + else + dict_foreach(dict, afr_aggregate_stime_xattr, local->dict); + local->op_ret = 0; + } - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.getxattr.last_tried; +cleanup: + UNLOCK(&frame->lock); - if (this_try == read_child) { - goto 
retry; - } - - unwind = 0; - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->getxattr, - &local->loc, - local->cont.getxattr.name); - } + if (!callcnt) { + AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, + local->dict, xdata); + } out: - if (unwind) { - if (op_ret >= 0 && dict) - __filter_xattrs (dict); + return 0; +} - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); - } +static gf_boolean_t +afr_is_special_xattr(const char *name, fop_getxattr_cbk_t *cbk, + gf_boolean_t is_fgetxattr) +{ + gf_boolean_t is_spl = _gf_true; + + GF_ASSERT(cbk); + if (!cbk || !name) { + is_spl = _gf_false; + goto out; + } + + if (!strcmp(name, GF_XATTR_PATHINFO_KEY) || + !strcmp(name, GF_XATTR_USER_PATHINFO_KEY)) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_pathinfo_cbk; + } else { + *cbk = afr_getxattr_pathinfo_cbk; + } + } else if (!strncmp(name, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD))) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_clrlk_cbk; + } else { + *cbk = afr_getxattr_clrlk_cbk; + } + } else if (!strncmp(name, GF_XATTR_LOCKINFO_KEY, + SLEN(GF_XATTR_LOCKINFO_KEY))) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_lockinfo_cbk; + } else { + *cbk = afr_getxattr_lockinfo_cbk; + } + } else if (fnmatch(GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) { + *cbk = afr_common_getxattr_stime_cbk; + } else if (strcmp(name, QUOTA_SIZE_KEY) == 0) { + *cbk = afr_getxattr_quota_size_cbk; + } else if (!strcmp(name, GF_XATTR_LIST_NODE_UUIDS_KEY)) { + *cbk = afr_getxattr_list_node_uuids_cbk; + } else { + is_spl = _gf_false; + } - return 0; +out: + return is_spl; } -int32_t -afr_getxattr_unwind (call_frame_t *frame, - int op_ret, int op_errno, dict_t *dict) +static void +afr_getxattr_all_subvols(xlator_t *this, call_frame_t *frame, const char *name, + loc_t *loc, fop_getxattr_cbk_t cbk) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + + 
priv = this->private; + + local = frame->local; + // local->call_count set in afr_local_init + call_count = local->call_count; + + if (!strcmp(name, GF_XATTR_LIST_NODE_UUIDS_KEY)) { + GF_FREE(local->cont.getxattr.name); + local->cont.getxattr.name = gf_strdup(GF_XATTR_NODE_UUID_KEY); + } + + // If up-children count is 0, afr_local_init would have failed already + // and the call would have unwound so not handling it here. + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE(frame, cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->getxattr, loc, + local->cont.getxattr.name, NULL); + if (!--call_count) + break; + } + } + return; +} +int +afr_marker_populate_args(call_frame_t *frame, int type, int *gauge, + xlator_t **subvols) { - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); - return 0; + xlator_t *this = frame->this; + afr_private_t *priv = this->private; + + memcpy(subvols, priv->children, sizeof(*subvols) * priv->child_count); + + if (type == MARKER_XTIME_TYPE) { + /*Don't error out on ENOENT/ENOTCONN */ + gauge[MCNT_NOTFOUND] = 0; + gauge[MCNT_ENOTCONN] = 0; + } + return priv->child_count; +} + +static int +afr_handle_heal_xattrs(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *heal_op) +{ + int ret = -1; + afr_spb_status_t *data = NULL; + + if (!strcmp(heal_op, GF_HEAL_INFO)) { + afr_get_heal_info(frame, this, loc); + ret = 0; + goto out; + } + + if (!strcmp(heal_op, GF_AFR_HEAL_SBRAIN)) { + afr_heal_splitbrain_file(frame, this, loc); + ret = 0; + goto out; + } + + if (!strcmp(heal_op, GF_AFR_SBRAIN_STATUS)) { + data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_spb_status_t); + if (!data) { + ret = 1; + goto out; + } + data->frame = frame; + data->loc = loc; + ret = synctask_new(this->ctx->env, afr_get_split_brain_status, + afr_get_split_brain_status_cbk, NULL, data); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS, + "Failed to create" + " synctask. 
Unable to fetch split-brain status" + " for %s.", + loc->name); + ret = 1; + goto out; + } + goto out; + } + +out: + if (ret == 1) { + AFR_STACK_UNWIND(getxattr, frame, -1, ENOMEM, NULL, NULL); + if (data) + GF_FREE(data); + ret = 0; + } + return ret; } int32_t -afr_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) +afr_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t * local = NULL; - xlator_list_t * trav = NULL; - xlator_t ** sub_volumes= NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int i = 0; + int32_t op_errno = 0; + int ret = -1; + fop_getxattr_cbk_t cbk = NULL; - int read_child = -1; - int i = 0; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - int32_t op_ret = -1; - int32_t op_errno = 0; + priv = this->private; + children = priv->children; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + loc_copy(&local->loc, loc); - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + local->op = GF_FOP_GETXATTR; - children = priv->children; + if (xdata) + local->xdata_req = dict_ref(xdata); - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; + if (!name) + goto no_name; - loc_copy (&local->loc, loc); - if (name) - local->cont.getxattr.name = gf_strdup (name); + local->cont.getxattr.name = gf_strdup(name); + if (!local->cont.getxattr.name) { + op_errno = ENOMEM; + goto out; + } - if (name) { - if (!strncmp (name, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { + if (!strncmp(name, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) { + op_errno = ENODATA; + goto out; + } - op_errno = ENODATA; - goto out; - } - if ((strcmp (GF_XATTR_MARKER_KEY, name) == 0) - && (-1 == frame->root->pid)) { + if (cluster_handle_marker_getxattr(frame, loc, name, 
priv->vol_uuid, + afr_getxattr_unwind, + afr_marker_populate_args) == 0) + return 0; - local->marker.call_count = priv->child_count; + ret = afr_handle_heal_xattrs(frame, this, &local->loc, name); + if (ret == 0) + return 0; - sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *)); - for (i = 0, trav = this->children; trav ; - trav = trav->next, i++) { + /* + * Heal daemons don't have IO threads ... and as a result they + * send this getxattr down and eventually crash :( + */ + op_errno = -1; + GF_CHECK_XATTR_KEY_AND_GOTO(name, IO_THREADS_QUEUE_SIZE_KEY, op_errno, out); + + /* + * Special xattrs which need responses from all subvols + */ + if (afr_is_special_xattr(name, &cbk, 0)) { + afr_getxattr_all_subvols(this, frame, name, loc, cbk); + return 0; + } - *(sub_volumes + i) = trav->xlator; - } + if (XATTR_IS_NODE_UUID(name)) { + i = 0; + STACK_WIND_COOKIE(frame, afr_getxattr_node_uuid_cbk, (void *)(long)i, + children[i], children[i]->fops->getxattr, loc, name, + xdata); + return 0; + } - if (cluster_getmarkerattr (frame, this, loc, name, - local, afr_getxattr_unwind, - sub_volumes, - priv->child_count, - MARKER_UUID_TYPE, - priv->vol_uuid)) { +no_name: - op_errno = EINVAL; - goto out; - } + afr_read_txn(frame, this, local->loc.inode, afr_getxattr_wind, + AFR_METADATA_TRANSACTION); - return 0; - } + ret = 0; +out: + if (ret < 0) + AFR_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL); + return 0; +} - if (*priv->vol_uuid) { - if ((match_uuid_local (name, priv->vol_uuid) == 0) - && (-1 == frame->root->pid)) { +/* {{{ fgetxattr */ - local->marker.call_count = priv->child_count; +int32_t +afr_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; - sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *)); - for (i = 0, trav = this->children; trav ; - trav = trav->next, i++) { + local = frame->local; - *(sub_volumes + i) = trav->xlator; + if 
(op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - } + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - if (cluster_getmarkerattr (frame, this, loc, - name, local, - afr_getxattr_unwind, - sub_volumes, - priv->child_count, - MARKER_XTIME_TYPE, - priv->vol_uuid)) { - op_errno = EINVAL; - goto out; - } + if (dict) + afr_filter_xattrs(dict); - return 0; - } - } + AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata); - } + return 0; +} - read_child = afr_read_child (this, loc->inode); +int +afr_fgetxattr_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - if ((read_child >= 0) && (priv->child_up[read_child])) { - call_child = read_child; + local = frame->local; + priv = this->private; - local->cont.getxattr.last_tried = -1; - } else { - call_child = afr_first_up_child (priv); + if (subvol == -1) { + AFR_STACK_UNWIND(fgetxattr, frame, local->op_ret, local->op_errno, NULL, + NULL); + return 0; + } - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } + STACK_WIND_COOKIE(frame, afr_fgetxattr_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fgetxattr, local->fd, + local->cont.getxattr.name, local->xdata_req); + return 0; +} - local->cont.getxattr.last_tried = call_child; +static void +afr_fgetxattr_all_subvols(xlator_t *this, call_frame_t *frame, + fop_fgetxattr_cbk_t cbk) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + + priv = this->private; + + local = frame->local; + // local->call_count set in afr_local_init + call_count = local->call_count; + + // If up-children count is 0, afr_local_init would have failed already + // and the call would have unwound so not handling it here. 
+ + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE(frame, cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->fgetxattr, local->fd, + local->cont.getxattr.name, NULL); + if (!--call_count) + break; } + } + return; +} - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) call_child, - children[call_child], children[call_child]->fops->getxattr, - loc, name); +int +afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; + fop_fgetxattr_cbk_t cbk = NULL; + + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_FGETXATTR; + local->fd = fd_ref(fd); + if (name) { + local->cont.getxattr.name = gf_strdup(name); + if (!local->cont.getxattr.name) { + op_errno = ENOMEM; + goto out; + } + } + if (xdata) + local->xdata_req = dict_ref(xdata); + + /* pathinfo gets handled only in getxattr(), but we need to handle + * lockinfo. + * If we are doing fgetxattr with lockinfo as the key then we + * collect information from all children. 
+ */ + if (afr_is_special_xattr(name, &cbk, 1)) { + afr_fgetxattr_all_subvols(this, frame, cbk); + return 0; + } + + afr_fix_open(fd, this); + + afr_read_txn(frame, this, fd->inode, afr_fgetxattr_wind, + AFR_METADATA_TRANSACTION); - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, NULL); - } - return 0; -} + AFR_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL); + return 0; +} /* }}} */ /* {{{ readv */ -/** - * read algorithm: - * - * if the user has specified a read subvolume, use it - * otherwise - - * use the inode number to hash it to one of the subvolumes, and - * read from there (to balance read load) - * - * if any of the above read's fail, try the children in sequence - * beginning at the beginning - */ - -int32_t -afr_readv_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref) +int +afr_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, int32_t count, + struct iatt *buf, struct iobref *iobref, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_local_t *local = NULL; - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; + local = frame->local; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } + + AFR_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, buf, iobref, + xdata); + return 0; +} - children = priv->children; +int +afr_readv_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - 
local = frame->local; + local = frame->local; + priv = this->private; - read_child = (long) cookie; + if (subvol == -1) { + AFR_STACK_UNWIND(readv, frame, local->op_ret, local->op_errno, 0, 0, 0, + 0, 0); + return 0; + } - if (op_ret == -1) { - retry: - last_tried = local->cont.readv.last_tried; + STACK_WIND_COOKIE( + frame, afr_readv_cbk, (void *)(long)subvol, priv->children[subvol], + priv->children[subvol]->fops->readv, local->fd, local->cont.readv.size, + local->cont.readv.offset, local->cont.readv.flags, local->xdata_req); + return 0; +} - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.readv.last_tried; +int +afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; - if (this_try == read_child) { - /* - skip the read child since if we are here - we must have already tried that child - */ - goto retry; - } + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - unwind = 0; + local->op = GF_FOP_READ; + local->fd = fd_ref(fd); + local->cont.readv.size = size; + local->cont.readv.offset = offset; + local->cont.readv.flags = flags; + if (xdata) + local->xdata_req = dict_ref(xdata); - STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->readv, - local->fd, local->cont.readv.size, - local->cont.readv.offset); - } + afr_fix_open(fd, this); -out: - if (unwind) { - if (buf && local) - buf->ia_ino = local->cont.readv.ino; + afr_read_txn(frame, this, fd->inode, afr_readv_wind, AFR_DATA_TRANSACTION); - AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, - vector, count, buf, iobref); - } + return 0; +out: + AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0); - return 0; + return 0; } +/* }}} */ -int32_t -afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, 
size_t size, off_t offset) +/* {{{ seek */ + +int +afr_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, off_t offset, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_local_t *local = NULL; - int32_t read_child = -1; - int call_child = 0; + local = frame->local; - int32_t op_ret = -1; - int32_t op_errno = 0; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (fd, out); + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - priv = this->private; - children = priv->children; + AFR_STACK_UNWIND(seek, frame, op_ret, op_errno, offset, xdata); + return 0; +} - ALLOC_OR_GOTO (local, afr_local_t, out); +int +afr_seek_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - frame->local = local; + local = frame->local; + priv = this->private; - read_child = afr_read_child (this, fd->inode); + if (subvol == -1) { + AFR_STACK_UNWIND(seek, frame, local->op_ret, local->op_errno, 0, NULL); + return 0; + } - if ((read_child >= 0) && (priv->child_up[read_child])) { - call_child = read_child; + STACK_WIND_COOKIE( + frame, afr_seek_cbk, (void *)(long)subvol, priv->children[subvol], + priv->children[subvol]->fops->seek, local->fd, local->cont.seek.offset, + local->cont.seek.what, local->xdata_req); + return 0; +} - /* - if read fails from the read child, we try - all children starting with the first one - */ - local->cont.readv.last_tried = -1; +int +afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; - } else { - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no 
child is up"); - goto out; - } - - local->cont.readv.last_tried = call_child; - } - - local->fd = fd_ref (fd); - - local->cont.readv.ino = fd->inode->ino; - local->cont.readv.size = size; - local->cont.readv.offset = offset; - - STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readv, - fd, size, offset); - - op_ret = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_SEEK; + local->fd = fd_ref(fd); + local->cont.seek.offset = offset; + local->cont.seek.what = what; + if (xdata) + local->xdata_req = dict_ref(xdata); + + afr_fix_open(fd, this); + + afr_read_txn(frame, this, fd->inode, afr_seek_wind, AFR_DATA_TRANSACTION); + + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, NULL, 0, NULL, - NULL); - } - return 0; -} + AFR_STACK_UNWIND(seek, frame, -1, op_errno, 0, NULL); + return 0; +} /* }}} */ diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h index acc814fb7dc..8c982bc7e6f 100644 --- a/xlators/cluster/afr/src/afr-inode-read.h +++ b/xlators/cluster/afr/src/afr-inode-read.h @@ -1,47 +1,45 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. 
If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __INODE_READ_H__ #define __INODE_READ_H__ int32_t -afr_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask); +afr_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata); int32_t -afr_stat (call_frame_t *frame, xlator_t *this, - loc_t *loc); +afr_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); int32_t -afr_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd); +afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); int32_t -afr_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size); +afr_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata); int32_t -afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); +afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); int32_t -afr_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name); +afr_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata); +int32_t +afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata); + +int +afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata); +int +afr_handle_quota_size(call_frame_t *frame, xlator_t *this); #endif /* __INODE_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 050a4f0e9ef..1d6e4f3570a 100644 --- 
a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -1,1594 +1,2565 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ -#include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" - +#include <glusterfs/glusterfs.h> #include "afr.h" +#include <glusterfs/dict.h> +#include <glusterfs/logging.h> +#include <glusterfs/defaults.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include "protocol-common.h" +#include <glusterfs/byte-order.h> #include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-messages.h" -/* {{{ writev */ +static void +__afr_inode_write_finalize(call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int ret = 0; + int read_subvol = 0; + struct iatt *stbuf = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_read_subvol_args_t args = { + 0, + }; + + local = frame->local; + priv = this->private; + GF_VALIDATE_OR_GOTO(this->name, local->inode, out); + + /*This code needs to stay till DHT sends fops on linked + * inodes*/ + if (!inode_is_linked(local->inode)) { + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == -1) + continue; + if (!gf_uuid_is_null(local->replies[i].poststat.ia_gfid)) { + gf_uuid_copy(args.gfid, local->replies[i].poststat.ia_gfid); + args.ia_type = local->replies[i].poststat.ia_type; + break; + } else { + ret = dict_get_bin(local->replies[i].xdata, + DHT_IATT_IN_XDATA_KEY, (void **)&stbuf); + if (ret) + continue; + gf_uuid_copy(args.gfid, stbuf->ia_gfid); + args.ia_type = stbuf->ia_type; + break; + } + } + } + + if (local->transaction.type == AFR_METADATA_TRANSACTION) { + 
read_subvol = afr_metadata_subvol_get(local->inode, this, NULL, + local->readable, NULL, &args); + } else { + read_subvol = afr_data_subvol_get(local->inode, this, NULL, + local->readable, NULL, &args); + } + + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + afr_pick_error_xdata(local, priv, local->inode, local->readable, NULL, + NULL); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0) + continue; + + /* Order of checks in the compound conditional + below is important. + + - Highest precedence: largest op_ret + - Next precedence: if all op_rets are equal, read subvol + - Least precedence: any succeeded subvol + */ + if ((local->op_ret < local->replies[i].op_ret) || + ((local->op_ret == local->replies[i].op_ret) && + (i == read_subvol))) { + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + + local->cont.inode_wfop.prebuf = local->replies[i].prestat; + local->cont.inode_wfop.postbuf = local->replies[i].poststat; + + if (local->replies[i].xdata) { + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); + local->xdata_rsp = dict_ref(local->replies[i].xdata); + } + if (local->replies[i].xattr) { + if (local->xattr_rsp) + dict_unref(local->xattr_rsp); + local->xattr_rsp = dict_ref(local->replies[i].xattr); + } + } + } + + afr_set_in_flight_sb_status(this, frame, local->inode); +out: + return; +} -int -afr_writev_unwind (call_frame_t *frame, xlator_t *this) +static void +__afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xattr, dict_t *xdata) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + local->replies[child_index].valid = 1; + + if (AFR_IS_ARBITER_BRICK(priv, child_index) && op_ret == 1) + 
op_ret = iov_length(local->cont.writev.vector, + local->cont.writev.count); + + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (xdata) + local->replies[child_index].xdata = dict_ref(xdata); + + if (op_ret >= 0) { + if (prebuf) + local->replies[child_index].prestat = *prebuf; + if (postbuf) + local->replies[child_index].poststat = *postbuf; + if (xattr) + local->replies[child_index].xattr = dict_ref(xattr); + } else { + afr_transaction_fop_failed(frame, this, child_index); + } + + return; +} - local = frame->local; +static int +__afr_inode_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xattr, dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = (long)cookie; + int call_count = -1; + afr_private_t *priv = NULL; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + priv = this->private; + local = frame->local; - if (main_frame) { - local->cont.writev.prebuf.ia_ino = local->cont.writev.ino; - local->cont.writev.postbuf.ia_ino = local->cont.writev.ino; + LOCK(&frame->lock); + { + __afr_inode_write_fill(frame, this, child_index, op_ret, op_errno, + prebuf, postbuf, xattr, xdata); + call_count = --local->call_count; + } + UNLOCK(&frame->lock); - AFR_STACK_UNWIND (writev, main_frame, - local->op_ret, local->op_errno, - &local->cont.writev.prebuf, - &local->cont.writev.postbuf); + if (call_count == 0) { + __afr_inode_write_finalize(frame, this); + + if (afr_txn_nothing_failed(frame, this)) { + /*if it did pre-op, it will do post-op changing ctime*/ + if (priv->consistent_metadata && afr_needs_changelog_update(local)) + afr_zero_fill_stat(local); + local->transaction.unwind(frame, this); } - return 0; + + afr_transaction_resume(frame, this); + } + + return 0; } +/* {{{ writev */ 
-int -afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +void +afr_writev_copy_outvars(call_frame_t *src_frame, call_frame_t *dst_frame) { - afr_local_t * local = NULL; - - int child_index = (long) cookie; - int call_count = -1; - int read_child = 0; + afr_local_t *src_local = NULL; + afr_local_t *dst_local = NULL; + + src_local = src_frame->local; + dst_local = dst_frame->local; + + dst_local->op_ret = src_local->op_ret; + dst_local->op_errno = src_local->op_errno; + dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; + dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; + if (src_local->xdata_rsp) + dst_local->xdata_rsp = dict_ref(src_local->xdata_rsp); +} - local = frame->local; +void +afr_writev_unwind(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = this->private; - read_child = afr_read_child (this, local->fd->inode); + local = frame->local; - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } + if (priv->consistent_metadata) + afr_zero_fill_stat(local); - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); + AFR_STACK_UNWIND(writev, frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); +} - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.writev.prebuf = *prebuf; - local->cont.writev.postbuf = *postbuf; - } +int +afr_transaction_writev_unwind(call_frame_t *frame, xlator_t *this) +{ + call_frame_t *fop_frame = NULL; - if (child_index == read_child) { - local->cont.writev.prebuf = *prebuf; - local->cont.writev.postbuf = *postbuf; - } - } + fop_frame = afr_transaction_detach_fop_frame(frame); - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count 
= afr_frame_return (frame); + if (fop_frame) { + afr_writev_copy_outvars(frame, fop_frame); + afr_writev_unwind(fop_frame, this); + } + return 0; +} - if (call_count == 0) { - local->transaction.unwind (frame, this); +static void +afr_writev_handle_short_writes(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + /* + * We already have the best case result of the writev calls staged + * as the return value. Any writev that returns some value less + * than the best case is now out of sync, so mark the fop as + * failed. Note that fops that have returned with errors have + * already been marked as failed. + */ + for (i = 0; i < priv->child_count; i++) { + if ((!local->replies[i].valid) || (local->replies[i].op_ret == -1)) + continue; + + if (local->replies[i].op_ret < local->op_ret) + afr_transaction_fop_failed(frame, this, i); + } +} - local->transaction.resume (frame, this); - } - return 0; +void +afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int ret = 0; + afr_local_t *local = frame->local; + uint32_t open_fd_count = 0; + uint32_t write_is_append = 0; + int32_t num_inodelks = 0; + + LOCK(&frame->lock); + { + __afr_inode_write_fill(frame, this, child_index, op_ret, op_errno, + prebuf, postbuf, NULL, xdata); + if (op_ret == -1 || !xdata) + goto unlock; + + write_is_append = 0; + ret = dict_get_uint32(xdata, GLUSTERFS_WRITE_IS_APPEND, + &write_is_append); + if (ret || !write_is_append) + local->append_write = _gf_false; + + ret = dict_get_uint32(xdata, GLUSTERFS_ACTIVE_FD_COUNT, &open_fd_count); + if (ret < 0) + goto unlock; + if (open_fd_count > local->open_fd_count) { + local->open_fd_count = open_fd_count; + local->update_open_fd_count = _gf_true; + } + + ret = dict_get_int32_sizen(xdata, GLUSTERFS_INODELK_COUNT, + 
&num_inodelks); + if (ret < 0) + goto unlock; + if (num_inodelks > local->num_inodelks) { + local->num_inodelks = num_inodelks; + local->update_num_inodelks = _gf_true; + } + } +unlock: + UNLOCK(&frame->lock); } +void +afr_process_post_writev(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_lock_t *lock = NULL; + + local = frame->local; + + if (!local->stable_write && !local->append_write) + /* An appended write removes the necessity to + fsync() the file. This is because self-heal + has the logic to check for larger file when + the xattrs are not reliably pointing at + a stale file. + */ + afr_fd_report_unstable_write(this, local); + + __afr_inode_write_finalize(frame, this); + + afr_writev_handle_short_writes(frame, this); + + if (local->update_open_fd_count) + local->inode_ctx->open_fd_count = local->open_fd_count; + if (local->update_num_inodelks && + local->transaction.type == AFR_DATA_TRANSACTION) { + lock = &local->inode_ctx->lock[local->transaction.type]; + lock->num_inodelks = local->num_inodelks; + } +} int -afr_writev_wind (call_frame_t *frame, xlator_t *this) +afr_writev_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + call_frame_t *fop_frame = NULL; + int child_index = (long)cookie; + int call_count = -1; - int i = 0; - int call_count = -1; + afr_inode_write_fill(frame, this, child_index, op_ret, op_errno, prebuf, + postbuf, xdata); - local = frame->local; - priv = this->private; + call_count = afr_frame_return(frame); - call_count = afr_up_children_count (priv->child_count, local->child_up); + if (call_count == 0) { + afr_process_post_writev(frame, this); - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - 
STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->writev, - local->fd, - local->cont.writev.vector, - local->cont.writev.count, - local->cont.writev.offset, - local->cont.writev.iobref); - - if (!--call_count) - break; - } - } - - return 0; + if (!afr_txn_nothing_failed(frame, this)) { + // Don't unwind until post-op is complete + afr_transaction_resume(frame, this); + } else { + /* + * Generally inode-write fops do transaction.unwind then + * transaction.resume, but writev needs to make sure that + * delayed post-op frame is placed in fdctx before unwind + * happens. This prevents the race of flush doing the + * changelog wakeup first in fuse thread and then this + * writev placing its delayed post-op frame in fdctx. + * This helps flush make sure all the delayed post-ops are + * completed. + */ + + fop_frame = afr_transaction_detach_fop_frame(frame); + afr_writev_copy_outvars(frame, fop_frame); + afr_transaction_resume(frame, this); + afr_writev_unwind(fop_frame, this); + } + } + return 0; } +static int +afr_arbiter_writev_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + static char byte = 0xFF; + static struct iovec vector = {&byte, 1}; + int32_t count = 1; + + STACK_WIND_COOKIE( + frame, afr_writev_wind_cbk, (void *)(long)subvol, + priv->children[subvol], priv->children[subvol]->fops->writev, local->fd, + &vector, count, local->cont.writev.offset, local->cont.writev.flags, + local->cont.writev.iobref, local->xdata_req); + + return 0; +} int -afr_writev_done (call_frame_t *frame, xlator_t *this) +afr_writev_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - - local = frame->local; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - iobref_unref (local->cont.writev.iobref); - local->cont.writev.iobref = NULL; - - local->transaction.unwind (frame, this); - - 
AFR_STACK_DESTROY (frame); + local = frame->local; + priv = this->private; + if (AFR_IS_ARBITER_BRICK(priv, subvol)) { + afr_arbiter_writev_wind(frame, this, subvol); return 0; + } + + STACK_WIND_COOKIE(frame, afr_writev_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->writev, local->fd, + local->cont.writev.vector, local->cont.writev.count, + local->cont.writev.offset, local->cont.writev.flags, + local->cont.writev.iobref, local->xdata_req); + return 0; } +int +afr_do_writev(call_frame_t *frame, xlator_t *this) +{ + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = frame->local; + transaction_frame->local = local; + frame->local = NULL; + + if (!AFR_FRAME_INIT(frame, op_errno)) + goto out; + + local->op = GF_FOP_WRITE; + + local->transaction.wind = afr_writev_wind; + local->transaction.unwind = afr_transaction_writev_unwind; + + local->transaction.main_frame = frame; + + if (local->fd->flags & O_APPEND) { + /* + * Backend vfs ignores the 'offset' for append mode fd so + * locking just the region provided for the writev does not + * give consistency guarantee. The actual write may happen at a + * completely different range than the one provided by the + * offset, len in the fop. So lock the entire file. 
+ */ + local->transaction.start = 0; + local->transaction.len = 0; + } else { + local->transaction.start = local->cont.writev.offset; + local->transaction.len = iov_length(local->cont.writev.vector, + local->cont.writev.count); + } + + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); + + AFR_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} int -afr_do_writev (call_frame_t *frame, xlator_t *this) +afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; + afr_local_t *local = NULL; + int op_errno = ENOMEM; + int ret = -1; + + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->cont.writev.vector = iov_dup(vector, count); + if (!local->cont.writev.vector) + goto out; + local->cont.writev.count = count; + local->cont.writev.offset = offset; + local->cont.writev.flags = flags; + local->cont.writev.iobref = iobref_ref(iobref); + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; + + if (dict_set_uint32(local->xdata_req, GLUSTERFS_ACTIVE_FD_COUNT, 4)) { + op_errno = ENOMEM; + goto out; + } + + if (dict_set_str_sizen(local->xdata_req, GLUSTERFS_INODELK_DOM_COUNT, + this->name)) { + op_errno = ENOMEM; + goto out; + } + + if (dict_set_uint32(local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) { + op_errno = ENOMEM; + goto out; + } + + /* Set append_write to be true speculatively. 
If on any + server it turns not be true, we unset it in the + callback. + */ + local->append_write = _gf_true; + + /* detect here, but set it in writev_wind_cbk *after* the unstable + write is performed + */ + local->stable_write = !!((fd->flags | flags) & (O_SYNC | O_DSYNC)); + + afr_fix_open(fd, this); + + afr_do_writev(frame, this); + + return 0; +out: + AFR_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL); - int op_ret = -1; - int op_errno = 0; + return 0; +} - local = frame->local; +/* }}} */ - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - op_errno = ENOMEM; - goto out; - } +/* {{{ truncate */ - transaction_frame->local = local; - frame->local = NULL; +int +afr_truncate_unwind(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local->op = GF_FOP_WRITE; + local = frame->local; - local->success_count = 0; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; - local->transaction.fop = afr_writev_wind; - local->transaction.done = afr_writev_done; - local->transaction.unwind = afr_writev_unwind; + AFR_STACK_UNWIND(truncate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} - local->transaction.main_frame = frame; - if (local->fd->flags & O_APPEND) { - local->transaction.start = 0; - local->transaction.len = 0; - } else { - local->transaction.start = local->cont.writev.offset; - local->transaction.len = iov_length (local->cont.writev.vector, - local->cont.writev.count); - } +int +afr_truncate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + local = frame->local; - op_ret = 0; -out: - if 
(op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL); - } + if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; - return 0; + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); } - int -afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) +afr_truncate_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - int ret = -1; + local = frame->local; + priv = this->private; - int op_ret = -1; - int op_errno = 0; + STACK_WIND_COOKIE(frame, afr_truncate_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->truncate, &local->loc, + local->cont.truncate.offset, local->xdata_req); + return 0; +} - uint64_t ctx; - afr_fd_ctx_t *fd_ctx = NULL; +int +afr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - priv = this->private; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - ALLOC_OR_GOTO (local, afr_local_t, out); + local->cont.truncate.offset = offset; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + if (!local->xdata_req) + goto out; - frame->local = local; + local->transaction.wind = afr_truncate_wind; + local->transaction.unwind = 
afr_truncate_unwind; - local->cont.writev.vector = iov_dup (vector, count); - local->cont.writev.count = count; - local->cont.writev.offset = offset; - local->cont.writev.ino = fd->inode->ino; - local->cont.writev.iobref = iobref_ref (iobref); + loc_copy(&local->loc, loc); + ret = afr_set_inode_local(this, local, loc->inode); + if (ret) + goto out; - local->fd = fd_ref (fd); + local->op = GF_FOP_TRUNCATE; - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - goto out; - } + local->transaction.main_frame = frame; + local->transaction.start = offset; + local->transaction.len = 0; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + /* Set it true speculatively, will get reset in afr_truncate_wind_cbk + if truncate was not a NOP */ + local->stable_write = _gf_true; - if (fd_ctx->up_count < priv->up_count) { - local->openfd_flush_cbk = afr_do_writev; - afr_openfd_flush (frame, this, fd); - } else { - afr_do_writev (frame, this); - } + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - /* }}} */ -/* {{{ truncate */ +/* {{{ ftruncate */ int -afr_truncate_unwind (call_frame_t *frame, xlator_t *this) +afr_ftruncate_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - local->cont.truncate.prebuf.ia_ino = local->cont.truncate.ino; - local->cont.truncate.postbuf.ia_ino = 
local->cont.truncate.ino; - - AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, - local->op_errno, - &local->cont.truncate.prebuf, - &local->cont.truncate.postbuf); - } + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(ftruncate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} int -afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +afr_ftruncate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_read_child (this, local->loc.inode); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - if (afr_fop_failed (op_ret, op_errno) && op_errno != EFBIG) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.truncate.prebuf = *prebuf; - local->cont.truncate.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.truncate.prebuf = *prebuf; - local->cont.truncate.postbuf = *postbuf; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; - call_count = afr_frame_return (frame); + local = frame->local; - if (call_count == 0) { - local->transaction.resume (frame, this); - } + if (op_ret == 0 && 
prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; - return 0; + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); } - -int32_t -afr_truncate_wind (call_frame_t *frame, xlator_t *this) +int +afr_ftruncate_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->truncate, - &local->loc, - local->cont.truncate.offset); - - if (!--call_count) - break; - } - } + local = frame->local; + priv = this->private; - return 0; + STACK_WIND_COOKIE(frame, afr_ftruncate_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->ftruncate, local->fd, + local->cont.ftruncate.offset, local->xdata_req); + return 0; } - int -afr_truncate_done (call_frame_t *frame, xlator_t *this) +afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local->transaction.unwind (frame, this); + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - AFR_STACK_DESTROY (frame); - - return 0; -} + local->cont.ftruncate.offset = offset; + if (xdata) + 
local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + if (!local->xdata_req) + goto out; -int -afr_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - int ret = -1; + local->op = GF_FOP_FTRUNCATE; - int op_ret = -1; - int op_errno = 0; + local->transaction.wind = afr_ftruncate_wind; + local->transaction.unwind = afr_ftruncate_unwind; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local->transaction.main_frame = frame; - priv = this->private; + local->transaction.start = local->cont.ftruncate.offset; + local->transaction.len = 0; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } + afr_fix_open(fd, this); - ALLOC_OR_GOTO (local, afr_local_t, out); + /* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk + if truncate was not a NOP */ + local->stable_write = _gf_true; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - transaction_frame->local = local; + return 0; +out: + AFR_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL); - local->op_ret = -1; + return 0; +} - local->cont.truncate.offset = offset; - local->cont.truncate.ino = loc->inode->ino; +/* }}} */ - local->transaction.fop = afr_truncate_wind; - local->transaction.done = afr_truncate_done; - local->transaction.unwind = afr_truncate_unwind; +/* {{{ setattr */ - loc_copy (&local->loc, loc); +int +afr_setattr_unwind(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + call_frame_t 
*main_frame = NULL; - local->transaction.main_frame = frame; - local->transaction.start = 0; - local->transaction.len = offset; + local = frame->local; - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (truncate, frame, op_ret, op_errno, NULL, NULL); - } + AFR_STACK_UNWIND(setattr, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} - return 0; +int +afr_setattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preop, + struct iatt *postop, dict_t *xdata) +{ + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, preop, + postop, NULL, xdata); } +int +afr_setattr_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_setattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->setattr, &local->loc, + &local->cont.setattr.in_buf, local->cont.setattr.valid, + local->xdata_req); + return 0; +} -/* }}} */ +int +afr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, + int32_t valid, dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + local->cont.setattr.in_buf = *buf; + local->cont.setattr.valid = valid; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto 
out; + + local->transaction.wind = afr_setattr_wind; + local->transaction.unwind = afr_setattr_unwind; + + loc_copy(&local->loc, loc); + ret = afr_set_inode_local(this, local, loc->inode); + if (ret) + goto out; + + local->op = GF_FOP_SETATTR; + + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; + + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); -/* {{{ ftruncate */ + AFR_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} +/* {{{ fsetattr */ int -afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) +afr_fsetattr_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; + local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; - if (main_frame) { - local->cont.ftruncate.prebuf.ia_ino = local->cont.ftruncate.ino; - local->cont.ftruncate.postbuf.ia_ino = local->cont.ftruncate.ino; + AFR_STACK_UNWIND(fsetattr, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} - AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, - local->op_errno, - &local->cont.ftruncate.prebuf, - &local->cont.ftruncate.postbuf); - } - return 0; +int +afr_fsetattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preop, + struct iatt *postop, dict_t *xdata) +{ + return __afr_inode_write_cbk(frame, cookie, this, op_ret, 
op_errno, preop, + postop, NULL, xdata); } +int +afr_fsetattr_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_fsetattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsetattr, local->fd, + &local->cont.fsetattr.in_buf, local->cont.fsetattr.valid, + local->xdata_req); + return 0; +} int -afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, + int32_t valid, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 0; - int read_child = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local = frame->local; - priv = this->private; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - read_child = afr_read_child (this, local->fd->inode); + local->cont.fsetattr.in_buf = *buf; + local->cont.fsetattr.valid = valid; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } + if (!local->xdata_req) + goto out; - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); + local->transaction.wind = afr_fsetattr_wind; + local->transaction.unwind = afr_fsetattr_unwind; - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.ftruncate.prebuf = *prebuf; - 
local->cont.ftruncate.postbuf = *postbuf; - } + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - if (child_index == read_child) { - local->cont.ftruncate.prebuf = *prebuf; - local->cont.ftruncate.postbuf = *postbuf; - } + local->op = GF_FOP_FSETATTR; - local->success_count++; + afr_fix_open(fd, this); - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - if (need_unwind) - local->transaction.unwind (frame, this); + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - call_count = afr_frame_return (frame); + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - if (call_count == 0) { - local->transaction.resume (frame, this); - } + AFR_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +/* {{{ setxattr */ + +int +afr_setxattr_unwind(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + local = frame->local; + + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; + + AFR_STACK_UNWIND(setxattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; } +int +afr_setxattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, NULL, xdata); +} int -afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) +afr_setxattr_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + 
priv = this->private; + + STACK_WIND_COOKIE(frame, afr_setxattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->setxattr, &local->loc, + local->cont.setxattr.dict, local->cont.setxattr.flags, + local->xdata_req); + return 0; +} - int call_count = -1; - int i = 0; +int +afr_emptyb_set_pending_changelog_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) - local = frame->local; - priv = this->private; +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i, ret = 0; + char *op_type = NULL; - call_count = afr_up_children_count (priv->child_count, local->child_up); + local = frame->local; + priv = this->private; + i = (long)cookie; - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; - local->call_count = call_count; + ret = dict_get_str_sizen(local->xdata_req, "replicate-brick-op", &op_type); + if (ret) + goto out; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ftruncate, - local->fd, local->cont.ftruncate.offset); - - if (!--call_count) - break; - } - } + gf_smsg(this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO, + op_ret ? op_errno : 0, AFR_MSG_SET_PEND_XATTR, "name=%s", + priv->children[i]->name, "op_ret=%s", + op_ret ? 
"failed" : "succeeded", NULL); - return 0; +out: + syncbarrier_wake(&local->barrier); + return 0; } - int -afr_ftruncate_done (call_frame_t *frame, xlator_t *this) +afr_emptyb_set_pending_changelog(call_frame_t *frame, xlator_t *this, + unsigned char *locked_nodes) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int ret = 0, i = 0; - local = frame->local; + local = frame->local; + priv = this->private; - local->transaction.unwind (frame, this); + AFR_ONLIST(locked_nodes, frame, afr_emptyb_set_pending_changelog_cbk, + xattrop, &local->loc, GF_XATTROP_ADD_ARRAY, local->xattr_req, + NULL); - AFR_STACK_DESTROY (frame); + /* It is sufficient if xattrop was successful on one child */ + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; - return 0; + if (local->replies[i].op_ret == 0) { + ret = 0; + goto out; + } else { + ret = afr_higher_errno(ret, local->replies[i].op_errno); + } + } +out: + return -ret; } +static int +_afr_handle_empty_brick_type(xlator_t *this, call_frame_t *frame, loc_t *loc, + int empty_index, afr_transaction_type type, + char *op_type, const int op_type_len) +{ + int count = 0; + int ret = -ENOMEM; + int idx = -1; + int d_idx = -1; + unsigned char *locked_nodes = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + locked_nodes = alloca0(priv->child_count); + + idx = afr_index_for_transaction_type(type); + d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION); + + local->pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!local->pending) + goto out; + + local->pending[empty_index][idx] = hton32(1); + + if ((priv->esh_granular) && (type == AFR_ENTRY_TRANSACTION)) + local->pending[empty_index][d_idx] = hton32(1); + + local->xdata_req = dict_new(); + if (!local->xdata_req) + goto out; + + ret = dict_set_nstrn(local->xdata_req, "replicate-brick-op", + SLEN("replicate-brick-op"), 
op_type, op_type_len); + if (ret) + goto out; + + local->xattr_req = dict_new(); + if (!local->xattr_req) + goto out; + + ret = afr_set_pending_dict(priv, local->xattr_req, local->pending); + if (ret < 0) + goto out; + + if (AFR_ENTRY_TRANSACTION == type) { + count = afr_selfheal_entrylk(frame, this, loc->inode, this->name, NULL, + locked_nodes); + } else { + count = afr_selfheal_inodelk(frame, this, loc->inode, this->name, + LLONG_MAX - 1, 0, locked_nodes); + } + + if (!count) { + gf_smsg(this->name, GF_LOG_ERROR, EAGAIN, AFR_MSG_REPLACE_BRICK_STATUS, + NULL); + ret = -EAGAIN; + goto unlock; + } + + ret = afr_emptyb_set_pending_changelog(frame, this, locked_nodes); + if (ret) + goto unlock; + ret = 0; +unlock: + if (AFR_ENTRY_TRANSACTION == type) { + afr_selfheal_unentrylk(frame, this, loc->inode, this->name, NULL, + locked_nodes, NULL); + } else { + afr_selfheal_uninodelk(frame, this, loc->inode, this->name, + LLONG_MAX - 1, 0, locked_nodes); + } +out: + return ret; +} -int -afr_do_ftruncate (call_frame_t *frame, xlator_t *this) +void +afr_brick_args_cleanup(void *opaque) { - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; + afr_empty_brick_args_t *data = NULL; - int op_ret = -1; - int op_errno = 0; + data = opaque; + loc_wipe(&data->loc); + GF_FREE(data); +} - local = frame->local; +int +_afr_handle_empty_brick_cbk(int ret, call_frame_t *frame, void *opaque) +{ + afr_brick_args_cleanup(opaque); + return 0; +} - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } +int +_afr_handle_empty_brick(void *opaque) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int empty_index = -1; + int ret = -1; + int op_errno = ENOMEM; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + char *op_type = NULL; + int op_type_len = 0; + afr_empty_brick_args_t *data = NULL; + call_frame_t *op_frame = NULL; + + data = opaque; + frame = data->frame; + 
empty_index = data->empty_index; + if (!data->op_type) + goto out; + + op_frame = copy_frame(frame); + if (!op_frame) { + ret = -1; + op_errno = ENOMEM; + goto out; + } + + op_type = data->op_type; + op_type_len = strlen(op_type); + this = op_frame->this; + priv = this->private; + + afr_set_lk_owner(op_frame, this, op_frame->root); + local = AFR_FRAME_INIT(op_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, &data->loc); + + gf_smsg(this->name, GF_LOG_INFO, 0, AFR_MSG_NEW_BRICK, "name=%s", + priv->children[empty_index]->name, NULL); + + ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index, + AFR_METADATA_TRANSACTION, op_type, + op_type_len); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + + dict_unref(local->xdata_req); + dict_unref(local->xattr_req); + afr_matrix_cleanup(local->pending, priv->child_count); + local->pending = NULL; + local->xattr_req = NULL; + local->xdata_req = NULL; + + ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index, + AFR_ENTRY_TRANSACTION, op_type, + op_type_len); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + ret = 0; +out: + if (op_frame) { + AFR_STACK_DESTROY(op_frame); + } + AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL); + return 0; +} - transaction_frame->local = local; - frame->local = NULL; +int +afr_split_brain_resolve_do(call_frame_t *frame, xlator_t *this, loc_t *loc, + char *data) +{ + afr_local_t *local = NULL; + int ret = -1; + int op_errno = EINVAL; + + local = frame->local; + local->xdata_req = dict_new(); + + if (!local->xdata_req) { + op_errno = ENOMEM; + goto out; + } + + ret = dict_set_int32_sizen(local->xdata_req, "heal-op", + GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + ret = dict_set_str_sizen(local->xdata_req, "child-name", data); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + /* set spb choice to -1 whether heal succeeds or not: + * If heal 
succeeds : spb-choice should be set to -1 as + * it is no longer valid; file is not + * in split-brain anymore. + * If heal doesn't succeed: + * spb-choice should be set to -1 + * otherwise reads will be served + * from spb-choice which is misleading. + */ + ret = afr_inode_split_brain_choice_set(loc->inode, this, -1); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_SET_FAILED, + NULL); + afr_heal_splitbrain_file(frame, this, loc); + ret = 0; +out: + if (ret < 0) + AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL); + return 0; +} - local->op = GF_FOP_FTRUNCATE; +int +afr_get_split_brain_child_index(xlator_t *this, void *value, size_t len) +{ + int spb_child_index = -1; + char *spb_child_str = NULL; - local->transaction.fop = afr_ftruncate_wind; - local->transaction.done = afr_ftruncate_done; - local->transaction.unwind = afr_ftruncate_unwind; + spb_child_str = alloca0(len + 1); + memcpy(spb_child_str, value, len); - local->transaction.main_frame = frame; + if (!strcmp(spb_child_str, "none")) + return -2; - local->transaction.start = 0; - local->transaction.len = local->cont.ftruncate.offset; + spb_child_index = afr_get_child_index_from_name(this, spb_child_str); + if (spb_child_index < 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL, + "subvol=%s", spb_child_str, NULL); + } + return spb_child_index; +} - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); +int +afr_can_set_split_brain_choice(void *opaque) +{ + afr_spbc_timeout_t *data = opaque; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + loc_t *loc = NULL; + int ret = -1; + + frame = data->frame; + loc = data->loc; + this = frame->this; + + ret = afr_is_split_brain(frame, this, loc->inode, loc->gfid, &data->d_spb, + &data->m_spb); + + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, "gfid=%s", + uuid_utoa(loc->gfid), NULL); + return ret; +} - op_ret = 0; +int +afr_handle_split_brain_commands(xlator_t 
*this, call_frame_t *frame, loc_t *loc, + dict_t *dict) +{ + void *choice_value = NULL; + void *resolve_value = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_spbc_timeout_t *data = NULL; + int len = 0; + int spb_child_index = -1; + int ret = -1; + int op_errno = EINVAL; + + priv = this->private; + + ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_CHOICE, &choice_value, &len); + ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_RESOLVE, &resolve_value, + &len); + if (!choice_value && !resolve_value) { + ret = -1; + goto out; + } + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) { + ret = 1; + goto out; + } + + local->op = GF_FOP_SETXATTR; + + if (choice_value) { + spb_child_index = afr_get_split_brain_child_index(this, choice_value, + len); + if (spb_child_index < 0) { + /* Case where value was "none" */ + if (spb_child_index == -2) + spb_child_index = -1; + else { + ret = 1; + op_errno = EINVAL; + goto out; + } + } + + data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_spbc_timeout_t); + if (!data) { + ret = 1; + goto out; + } + data->spb_child_index = spb_child_index; + data->frame = frame; + loc_copy(&local->loc, loc); + data->loc = &local->loc; + ret = synctask_new(this->ctx->env, afr_can_set_split_brain_choice, + afr_set_split_brain_choice, NULL, data); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS, + "name=%s", loc->name, NULL); + ret = 1; + op_errno = ENOMEM; + goto out; + } + ret = 0; + goto out; + } + + if (resolve_value) { + spb_child_index = afr_get_split_brain_child_index(this, resolve_value, + len); + if (spb_child_index < 0) { + ret = 1; + goto out; + } + + afr_split_brain_resolve_do(frame, this, loc, + priv->children[spb_child_index]->name); + ret = 0; + } out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, NULL); - } - - return 0; + /* key was correct but value was invalid when ret == 1 */ + 
if (ret == 1) { + AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL); + if (data) + GF_FREE(data); + ret = 0; + } + return ret; } - int -afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset) +afr_handle_spb_choice_timeout(xlator_t *this, call_frame_t *frame, dict_t *dict) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + uint64_t timeout = 0; + afr_private_t *priv = NULL; + + priv = this->private; - int ret = -1; + ret = dict_get_uint64(dict, GF_AFR_SPB_CHOICE_TIMEOUT, &timeout); + if (!ret) { + priv->spb_choice_timeout = timeout * 60; + AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL); + } - int op_ret = -1; - int op_errno = 0; + return ret; +} - uint64_t ctx; - afr_fd_ctx_t *fd_ctx = NULL; +int +afr_handle_empty_brick(xlator_t *this, call_frame_t *frame, loc_t *loc, + dict_t *dict) +{ + int ret = -1; + int ab_ret = -1; + int empty_index = -1; + int op_errno = EPERM; + char *empty_brick = NULL; + char *op_type = NULL; + afr_empty_brick_args_t *data = NULL; + + ret = dict_get_str_sizen(dict, GF_AFR_REPLACE_BRICK, &empty_brick); + if (!ret) + op_type = GF_AFR_REPLACE_BRICK; + + ab_ret = dict_get_str_sizen(dict, GF_AFR_ADD_BRICK, &empty_brick); + if (!ab_ret) + op_type = GF_AFR_ADD_BRICK; + + if (ret && ab_ret) + goto out; + + if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) { + gf_smsg(this->name, GF_LOG_ERROR, EPERM, AFR_MSG_INTERNAL_ATTR, + "op_type=%s", op_type, NULL); + ret = 1; + goto out; + } + empty_index = afr_get_child_index_from_name(this, empty_brick); + + if (empty_index < 0) { + /* Didn't belong to this replica pair + * Just do a no-op + */ + AFR_STACK_UNWIND(setxattr, frame, 0, 0, NULL); + return 0; + } else { + data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_empty_brick_t); + if (!data) { + ret = 1; + op_errno = ENOMEM; + goto out; + } + data->frame = frame; + loc_copy(&data->loc, loc); + data->empty_index = empty_index; + 
data->op_type = op_type; + ret = synctask_new(this->ctx->env, _afr_handle_empty_brick, + _afr_handle_empty_brick_cbk, NULL, data); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS, + NULL); + ret = 1; + op_errno = ENOMEM; + afr_brick_args_cleanup(data); + goto out; + } + } + ret = 0; +out: + if (ret == 1) { + AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL); + ret = 0; + } + return ret; +} - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); +static int +afr_handle_special_xattr(xlator_t *this, call_frame_t *frame, loc_t *loc, + dict_t *dict) +{ + int ret = -1; - priv = this->private; + ret = afr_handle_split_brain_commands(this, frame, loc, dict); + if (ret == 0) + goto out; - ALLOC_OR_GOTO (local, afr_local_t, out); + ret = afr_handle_spb_choice_timeout(this, frame, dict); + if (ret == 0) + goto out; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + /* Applicable for replace-brick and add-brick commands */ + ret = afr_handle_empty_brick(this, frame, loc, dict); +out: + return ret; +} - frame->local = local; +int +afr_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = EINVAL; - local->cont.ftruncate.offset = offset; - local->cont.ftruncate.ino = fd->inode->ino; + GF_IF_INTERNAL_XATTR_GOTO("trusted.afr.*", dict, op_errno, out); - local->fd = fd_ref (fd); + GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out); - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - goto out; - } + ret = afr_handle_special_xattr(this, frame, loc, dict); + if (ret == 0) + return 0; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - if (fd_ctx->up_count < priv->up_count) { - local->openfd_flush_cbk = 
afr_do_ftruncate; - afr_openfd_flush (frame, this, fd); - } else { - afr_do_ftruncate (frame, this); - } + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, NULL); - } + local->cont.setxattr.dict = dict_ref(dict); + local->cont.setxattr.flags = flags; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - return 0; -} + if (!local->xdata_req) + goto out; -/* }}} */ + local->transaction.wind = afr_setxattr_wind; + local->transaction.unwind = afr_setxattr_unwind; -/* {{{ setattr */ + loc_copy(&local->loc, loc); + ret = afr_set_inode_local(this, local, loc->inode); + if (ret) + goto out; -int -afr_setattr_unwind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - local = frame->local; + local->op = GF_FOP_SETXATTR; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - if (main_frame) { - local->cont.setattr.preop_buf.ia_ino = local->cont.setattr.ino; - local->cont.setattr.postop_buf.ia_ino = local->cont.setattr.ino; + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, - local->op_errno, - &local->cont.setattr.preop_buf, - &local->cont.setattr.postop_buf); - } + AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL); - return 0; + return 0; } +/* {{{ fsetxattr */ int -afr_setattr_wind_cbk (call_frame_t *frame, void 
*cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) +afr_fsetxattr_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - read_child = afr_read_child (this, local->loc.inode); + local = frame->local; - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); + AFR_STACK_UNWIND(fsetxattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; +} - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.setattr.preop_buf = *preop; - local->cont.setattr.postop_buf = *postop; - } +int +afr_fsetxattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, NULL, xdata); +} - if (child_index == read_child) { - local->cont.setattr.preop_buf = *preop; - local->cont.setattr.postop_buf = *postop; - } +int +afr_fsetxattr_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_fsetxattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsetxattr, local->fd, + local->cont.fsetxattr.dict, local->cont.fsetxattr.flags, + local->xdata_req); + return 0; +} - local->success_count++; +int +afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t 
*dict, + int32_t flags, dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + GF_IF_INTERNAL_XATTR_GOTO("trusted.afr.*", dict, op_errno, out); - if (need_unwind) - local->transaction.unwind (frame, this); + GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out); - call_count = afr_frame_return (frame); + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - if (call_count == 0) { - local->transaction.resume (frame, this); - } + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - return 0; -} + local->cont.fsetxattr.dict = dict_ref(dict); + local->cont.fsetxattr.flags = flags; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); -int32_t -afr_setattr_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + if (!local->xdata_req) + goto out; - int call_count = -1; - int i = 0; + local->transaction.wind = afr_fsetxattr_wind; + local->transaction.unwind = afr_fsetxattr_unwind; - local = frame->local; - priv = this->private; + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - call_count = afr_up_children_count (priv->child_count, local->child_up); + local->op = GF_FOP_FSETXATTR; - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - local->call_count = call_count; + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - for (i = 
0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, - &local->cont.setattr.in_buf, - local->cont.setattr.valid); - - if (!--call_count) - break; - } - } + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL); + return 0; } +/* }}} */ + +/* {{{ removexattr */ int -afr_setattr_done (call_frame_t *frame, xlator_t *this) +afr_removexattr_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; + + AFR_STACK_UNWIND(removexattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; } +int +afr_removexattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, NULL, xdata); +} int -afr_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid) +afr_removexattr_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - int ret = -1; + local = frame->local; + priv = this->private; - int op_ret = -1; - int op_errno = 0; + STACK_WIND_COOKIE(frame, afr_removexattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->removexattr, &local->loc, + local->cont.removexattr.name, local->xdata_req); + return 0; +} - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO 
(this, out); - VALIDATE_OR_GOTO (this->private, out); +int +afr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - priv = this->private; + GF_IF_NATIVE_XATTR_GOTO("trusted.afr.*", name, op_errno, out); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } + GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out); - ALLOC_OR_GOTO (local, afr_local_t, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - transaction_frame->local = local; + local->cont.removexattr.name = gf_strdup(name); - local->op_ret = -1; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - local->cont.setattr.ino = loc->inode->ino; + if (!local->xdata_req) + goto out; - local->cont.setattr.in_buf = *buf; - local->cont.setattr.valid = valid; + local->transaction.wind = afr_removexattr_wind; + local->transaction.unwind = afr_removexattr_unwind; - local->transaction.fop = afr_setattr_wind; - local->transaction.done = afr_setattr_done; - local->transaction.unwind = afr_setattr_unwind; + loc_copy(&local->loc, loc); + ret = afr_set_inode_local(this, local, loc->inode); + if (ret) + goto out; - loc_copy (&local->loc, loc); + local->op = GF_FOP_REMOVEXATTR; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + ret = 
afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setattr, frame, op_ret, op_errno, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL); + return 0; } -/* {{{ fsetattr */ - +/* ffremovexattr */ int -afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) +afr_fremovexattr_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - local->cont.fsetattr.preop_buf.ia_ino = - local->cont.fsetattr.ino; - local->cont.fsetattr.postop_buf.ia_ino = - local->cont.fsetattr.ino; - - AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, - local->op_errno, - &local->cont.fsetattr.preop_buf, - &local->cont.fsetattr.postop_buf); - } + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(fremovexattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; +} int -afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) +afr_fremovexattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = 
this->private; - - read_child = afr_read_child (this, local->fd->inode); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, NULL, xdata); +} - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); +int +afr_fremovexattr_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.fsetattr.preop_buf = *preop; - local->cont.fsetattr.postop_buf = *postop; - } + local = frame->local; + priv = this->private; - if (child_index == read_child) { - local->cont.fsetattr.preop_buf = *preop; - local->cont.fsetattr.postop_buf = *postop; - } + STACK_WIND_COOKIE(frame, afr_fremovexattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fremovexattr, local->fd, + local->cont.removexattr.name, local->xdata_req); + return 0; +} - local->success_count++; +int +afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + GF_IF_NATIVE_XATTR_GOTO("trusted.afr.*", name, op_errno, out); - if (need_unwind) - local->transaction.unwind (frame, this); + GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out); - call_count = afr_frame_return (frame); + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - if (call_count == 0) { - local->transaction.resume (frame, this); - } + local = 
AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - return 0; -} + local->cont.removexattr.name = gf_strdup(name); + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + if (!local->xdata_req) + goto out; -int32_t -afr_fsetattr_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + local->transaction.wind = afr_fremovexattr_wind; + local->transaction.unwind = afr_fremovexattr_unwind; - int call_count = -1; - int i = 0; + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - local = frame->local; - priv = this->private; + local->op = GF_FOP_FREMOVEXATTR; - call_count = afr_up_children_count (priv->child_count, local->child_up); + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - local->call_count = call_count; + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fsetattr, - local->fd, - &local->cont.fsetattr.in_buf, - local->cont.fsetattr.valid); - - if (!--call_count) - break; - } - } + AFR_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL); - return 0; + return 0; } - int -afr_fsetattr_done (call_frame_t *frame, xlator_t *this) +afr_fallocate_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); + local = frame->local; 
+ main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(fallocate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} int -afr_fsetattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *buf, int32_t valid) +afr_fallocate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - - int ret = -1; + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); +} - int op_ret = -1; - int op_errno = 0; +int +afr_fallocate_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_fallocate_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fallocate, local->fd, + local->cont.fallocate.mode, local->cont.fallocate.offset, + local->cont.fallocate.len, local->xdata_req); + return 0; +} - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); +int +afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int op_errno = ENOMEM; - priv = this->private; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; 
- ALLOC_OR_GOTO (local, afr_local_t, out); + local->cont.fallocate.mode = mode; + local->cont.fallocate.offset = offset; + local->cont.fallocate.len = len; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - transaction_frame->local = local; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - local->op_ret = -1; + if (!local->xdata_req) + goto out; - local->cont.fsetattr.ino = fd->inode->ino; + local->op = GF_FOP_FALLOCATE; - local->cont.fsetattr.in_buf = *buf; - local->cont.fsetattr.valid = valid; + local->transaction.wind = afr_fallocate_wind; + local->transaction.unwind = afr_fallocate_unwind; - local->transaction.fop = afr_fsetattr_wind; - local->transaction.done = afr_fsetattr_done; - local->transaction.unwind = afr_fsetattr_unwind; + local->transaction.main_frame = frame; - local->fd = fd_ref (fd); + local->transaction.start = local->cont.fallocate.offset; + local->transaction.len = 0; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + afr_fix_open(fd, this); - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } +/* }}} */ -/* {{{ setxattr */ - +/* {{{ discard */ int -afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) +afr_discard_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - 
call_frame_t *main_frame = NULL; - - local = frame->local; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + local = frame->local; - if (main_frame) { - AFR_STACK_UNWIND (setxattr, main_frame, - local->op_ret, local->op_errno) - } + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(discard, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} int -afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_discard_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int need_unwind = 0; + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); +} - local = frame->local; - priv = this->private; +int +afr_discard_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_discard_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->discard, local->fd, + local->cont.discard.offset, local->cont.discard.len, + local->xdata_req); + return 0; +} - LOCK (&frame->lock); - { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; +int +afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = 
-1; + int op_errno = ENOMEM; - if (local->success_count == priv->child_count) { - need_unwind = 1; - } - } + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - if (need_unwind) - local->transaction.unwind (frame, this); + local->cont.discard.offset = offset; + local->cont.discard.len = len; - call_count = afr_frame_return (frame); + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - if (call_count == 0) { - local->transaction.resume (frame, this); - } + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - return 0; -} + if (!local->xdata_req) + goto out; + local->op = GF_FOP_DISCARD; -int -afr_setxattr_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + local->transaction.wind = afr_discard_wind; + local->transaction.unwind = afr_discard_unwind; - int call_count = -1; - int i = 0; + local->transaction.main_frame = frame; - local = frame->local; - priv = this->private; + local->transaction.start = local->cont.discard.offset; + local->transaction.len = 0; - call_count = afr_up_children_count (priv->child_count, local->child_up); + afr_fix_open(fd, this); - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, - local->cont.setxattr.dict, - local->cont.setxattr.flags); - - if (!--call_count) 
- break; - } - } + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } +/* {{{ zerofill */ int -afr_setxattr_done (call_frame_t *frame, xlator_t *this) +afr_zerofill_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - AFR_STACK_DESTROY (frame); + local = frame->local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; + + AFR_STACK_UNWIND(discard, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; } int -afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags) +afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); +} - int ret = -1; +int +afr_zerofill_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_zerofill_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->zerofill, local->fd, + local->cont.zerofill.offset, local->cont.zerofill.len, + local->xdata_req); + return 0; +} - int op_ret = -1; - int op_errno = 0; +int +afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - 
VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - priv = this->private; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - ALLOC_OR_GOTO (local, afr_local_t, out); + local->cont.zerofill.offset = offset; + local->cont.zerofill.len = len; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - transaction_frame->local = local; + if (!local->xdata_req) + goto out; - local->op_ret = -1; + local->op = GF_FOP_ZEROFILL; - local->cont.setxattr.dict = dict_ref (dict); - local->cont.setxattr.flags = flags; + local->transaction.wind = afr_zerofill_wind; + local->transaction.unwind = afr_zerofill_unwind; - local->transaction.fop = afr_setxattr_wind; - local->transaction.done = afr_setxattr_done; - local->transaction.unwind = afr_setxattr_unwind; + local->transaction.main_frame = frame; - loc_copy (&local->loc, loc); + local->transaction.start = local->cont.zerofill.offset; + local->transaction.len = len; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + afr_fix_open(fd, this); - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND 
(setxattr, frame, op_ret, op_errno); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } /* }}} */ -/* {{{ removexattr */ +int32_t +afr_xattrop_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) +{ + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, xattr, xdata); +} +int +afr_xattrop_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_xattrop_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->xattrop, &local->loc, + local->cont.xattrop.optype, local->cont.xattrop.xattr, + local->xdata_req); + return 0; +} int -afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) +afr_xattrop_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; + local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (removexattr, main_frame, - local->op_ret, local->op_errno) - } + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(xattrop, main_frame, local->op_ret, local->op_errno, + local->xattr_rsp, local->xdata_rsp); + return 0; +} -int -afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +int32_t +afr_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - afr_local_t * local = NULL; 
- afr_private_t * priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - int call_count = -1; - int need_unwind = 0; + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local = frame->local; - priv = this->private; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - LOCK (&frame->lock); - { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; + local->cont.xattrop.xattr = dict_ref(xattr); + local->cont.xattrop.optype = optype; + if (xdata) + local->xdata_req = dict_ref(xdata); - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } + local->transaction.wind = afr_xattrop_wind; + local->transaction.unwind = afr_xattrop_unwind; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + loc_copy(&local->loc, loc); + ret = afr_set_inode_local(this, local, loc->inode); + if (ret) + goto out; - if (need_unwind) - local->transaction.unwind (frame, this); + local->op = GF_FOP_XATTROP; - call_count = afr_frame_return (frame); + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - if (call_count == 0) { - local->transaction.resume (frame, this); - } + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - return 0; -} + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); + AFR_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL); + return 0; +} int32_t -afr_removexattr_wind (call_frame_t *frame, xlator_t *this) +afr_fxattrop_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - 
call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->removexattr, - &local->loc, - local->cont.removexattr.name); - - if (!--call_count) - break; - } - } - - return 0; + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, xattr, xdata); } - int -afr_removexattr_done (call_frame_t *frame, xlator_t *this) +afr_fxattrop_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_fxattrop_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fxattrop, local->fd, + local->cont.xattrop.optype, local->cont.xattrop.xattr, + local->xdata_req); + return 0; } - int -afr_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) +afr_fxattrop_unwind(call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - int ret = -1; + local = frame->local; - int op_ret = -1; - int op_errno = 0; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); + AFR_STACK_UNWIND(fxattrop, main_frame, local->op_ret, local->op_errno, + local->xattr_rsp, local->xdata_rsp); + return 0; +} - priv = this->private; 
+int32_t +afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + local->cont.xattrop.xattr = dict_ref(xattr); + local->cont.xattrop.optype = optype; + if (xdata) + local->xdata_req = dict_ref(xdata); + + local->transaction.wind = afr_fxattrop_wind; + local->transaction.unwind = afr_fxattrop_unwind; + + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; + + local->op = GF_FOP_FXATTROP; + + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; + + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } + AFR_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL); + return 0; +} - ALLOC_OR_GOTO (local, afr_local_t, out); +int +afr_fsync_unwind(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = frame->local; - transaction_frame->local = local; + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) + return 0; - local->op_ret = -1; + AFR_STACK_UNWIND(fsync, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); - 
local->cont.removexattr.name = gf_strdup (name); + return 0; +} - local->transaction.fop = afr_removexattr_wind; - local->transaction.done = afr_removexattr_done; - local->transaction.unwind = afr_removexattr_unwind; +int +afr_fsync_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); +} - loc_copy (&local->loc, loc); +int +afr_fsync_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + local = frame->local; + priv = this->private; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + STACK_WIND_COOKIE(frame, afr_fsync_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsync, local->fd, + local->cont.fsync.datasync, local->xdata_req); + return 0; +} - op_ret = 0; +int +afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int32_t op_errno = ENOMEM; + int8_t last_fsync = 0; + + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + if (xdata) { + local->xdata_req = dict_copy_with_ref(xdata, NULL); + if (dict_get_int8(xdata, "last-fsync", &last_fsync) == 0) { + if (last_fsync) { + local->transaction.disable_delayed_post_op = _gf_true; + } + } + } else { + local->xdata_req = dict_new(); + } + + if (!local->xdata_req) + goto out; + + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; + + 
local->op = GF_FOP_FSYNC; + local->cont.fsync.datasync = datasync; + + if (afr_fd_has_witnessed_unstable_write(this, fd->inode)) { + /* don't care. we only wanted to CLEAR the bit */ + } + + local->transaction.wind = afr_fsync_wind; + local->transaction.unwind = afr_fsync_unwind; + + local->transaction.main_frame = frame; + + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (removexattr, frame, op_ret, op_errno); - } + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - return 0; + AFR_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; } diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h index 47589872206..a787069b7a1 100644 --- a/xlators/cluster/afr/src/afr-inode-write.h +++ b/xlators/cluster/afr/src/afr-inode-write.h @@ -1,72 +1,94 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __INODE_WRITE_H__ #define __INODE_WRITE_H__ int32_t -afr_chmod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode); +afr_chmod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dict_t *xdata); int32_t -afr_chown (call_frame_t *frame, xlator_t *this, - loc_t *loc, uid_t uid, gid_t gid); +afr_chown(call_frame_t *frame, xlator_t *this, loc_t *loc, uid_t uid, gid_t gid, + dict_t *xdata); int -afr_fchown (call_frame_t *frame, xlator_t *this, - fd_t *fd, uid_t uid, gid_t gid); +afr_fchown(call_frame_t *frame, xlator_t *this, fd_t *fd, uid_t uid, gid_t gid, + dict_t *xdata); + +int32_t +afr_fchmod(call_frame_t *frame, xlator_t *this, fd_t *fd, mode_t mode, + dict_t *xdata); + +int32_t +afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata); + +int32_t +afr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata); + +int32_t +afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata); int32_t -afr_fchmod (call_frame_t *frame, xlator_t *this, - fd_t *fd, mode_t mode); +afr_utimens(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct timespec tv[2], dict_t *xdata); + +int +afr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, + int32_t valid, dict_t *xdata); + +int +afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, + int32_t valid, dict_t *xdata); int32_t -afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref); +afr_setxattr(call_frame_t *frame, xlator_t *this, loc_t 
*loc, dict_t *dict, + int32_t flags, dict_t *xdata); int32_t -afr_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset); +afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata); int32_t -afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset); +afr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata); int32_t -afr_utimens (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct timespec tv[2]); +afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata); + +int +afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); int -afr_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid); +afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata); int -afr_fsetattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *buf, int32_t valid); +afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata); int32_t -afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags); +afr_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata); int32_t -afr_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name); +afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata); +int +afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata); #endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index d2f7579fc42..bc8eabe0f43 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -1,2183 +1,791 @@ 
/* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ -#include "dict.h" -#include "byte-order.h" -#include "common-utils.h" +#include <glusterfs/dict.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/common-utils.h> #include "afr.h" #include "afr-transaction.h" +#include "afr-messages.h" #include <signal.h> - -#define LOCKED_NO 0x0 /* no lock held */ -#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */ -#define LOCKED_LOWER 0x2 /* for lower path */ - -int -afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); - -static uint64_t afr_lock_number = 1; - -static uint64_t -get_afr_lock_number () -{ - return (++afr_lock_number); -} - -int -afr_set_lock_number (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->lock_number = get_afr_lock_number (); - - return 0; -} +#define LOCKED_NO 0x0 /* no lock held */ +#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */ +#define LOCKED_LOWER 0x2 /* for lower path */ void -afr_set_lk_owner (call_frame_t *frame, xlator_t *this) -{ - if (!frame->root->lk_owner) { - gf_log (this->name, GF_LOG_TRACE, - "Setting lk-owner=%llu", - (unsigned long long) (unsigned long)frame->root); - frame->root->lk_owner = (uint64_t) (unsigned long)frame->root; - } -} - -static int -is_afr_lock_selfheal (afr_local_t *local) +afr_lockee_cleanup(afr_lockee_t *lockee) { - afr_internal_lock_t *int_lock = NULL; - int ret = -1; - - int_lock = &local->internal_lock; - - switch (int_lock->selfheal_lk_type) { - case AFR_DATA_SELF_HEAL_LK: - case AFR_METADATA_SELF_HEAL_LK: - ret = 1; - break; - case AFR_ENTRY_SELF_HEAL_LK: - ret = 0; - break; - } + if (lockee->fd) { + fd_unref(lockee->fd); + lockee->fd = NULL; + } else { + loc_wipe(&lockee->loc); + } - return ret; + GF_FREE(lockee->basename); + lockee->basename = NULL; + GF_FREE(lockee->locked_nodes); + lockee->locked_nodes = NULL; -} - -int32_t -internal_lock_count 
(call_frame_t *frame, xlator_t *this, - afr_fd_ctx_t *fd_ctx) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int32_t call_count = 0; - int i = 0; - - local = frame->local; - priv = this->private; - - if (fd_ctx) { - GF_ASSERT (local->fd); - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && fd_ctx->opened_on[i]) - ++call_count; - } - } else { - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) - ++call_count; - } - } - - return call_count; -} - -static void -afr_print_inodelk (char *str, int size, int cmd, - struct gf_flock *flock, uint64_t owner) -{ - char *cmd_str = NULL; - char *type_str = NULL; - - switch (cmd) { -#if F_GETLK != F_GETLK64 - case F_GETLK64: -#endif - case F_GETLK: - cmd_str = "GETLK"; - break; - -#if F_SETLK != F_SETLK64 - case F_SETLK64: -#endif - case F_SETLK: - cmd_str = "SETLK"; - break; - -#if F_SETLKW != F_SETLKW64 - case F_SETLKW64: -#endif - case F_SETLKW: - cmd_str = "SETLKW"; - break; - - default: - cmd_str = "<null>"; - break; - } - - switch (flock->l_type) { - case F_RDLCK: - type_str = "READ"; - break; - case F_WRLCK: - type_str = "WRITE"; - break; - case F_UNLCK: - type_str = "UNLOCK"; - break; - default: - type_str = "UNKNOWN"; - break; - } - - snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, " - "start=%llu, len=%llu, pid=%llu, lk-owner=%llu", - cmd_str, type_str, (unsigned long long) flock->l_start, - (unsigned long long) flock->l_len, - (unsigned long long) flock->l_pid, - (unsigned long long) owner); - -} - -static void -afr_print_lockee (char *str, int size, loc_t *loc, fd_t *fd, - int child_index) -{ - snprintf (str, size, "path=%s, fd=%p, child=%d", - loc->path ? loc->path : "<nul>", - fd ? fd : NULL, - child_index); + return; } void -afr_print_entrylk (char *str, int size, const char *basename, - uint64_t owner) +afr_lockees_cleanup(afr_internal_lock_t *int_lock) { - snprintf (str, size, "Basename=%s, lk-owner=%llu", - basename ? 
basename : "<nul>", - (unsigned long long)owner); -} + int i = 0; -static void -afr_print_verdict (int op_ret, int op_errno, char *str) -{ - if (op_ret < 0) { - if (op_errno == EAGAIN) - strcpy (str, "EAGAIN"); - else - strcpy (str, "FAILED"); - } - else - strcpy (str, "GRANTED"); -} - -static void -afr_set_lock_call_type (afr_lock_call_type_t lock_call_type, - char *lock_call_type_str, - afr_internal_lock_t *int_lock) -{ - switch (lock_call_type) { - case AFR_INODELK_TRANSACTION: - if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) - strcpy (lock_call_type_str, "AFR_INODELK_TRANSACTION"); - else - strcpy (lock_call_type_str, "AFR_INODELK_SELFHEAL"); - break; - case AFR_INODELK_NB_TRANSACTION: - if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) - strcpy (lock_call_type_str, "AFR_INODELK_NB_TRANSACTION"); - else - strcpy (lock_call_type_str, "AFR_INODELK_NB_SELFHEAL"); - break; - case AFR_ENTRYLK_TRANSACTION: - if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) - strcpy (lock_call_type_str, "AFR_ENTRYLK_TRANSACTION"); - else - strcpy (lock_call_type_str, "AFR_ENTRYLK_SELFHEAL"); - break; - case AFR_ENTRYLK_NB_TRANSACTION: - if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) - strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_TRANSACTION"); - else - strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_SELFHEAL"); - break; - default: - strcpy (lock_call_type_str, "UNKNOWN"); - break; - } - -} - -static void -afr_trace_inodelk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, - afr_lock_op_type_t lk_op_type, struct gf_flock *flock, - int op_ret, int op_errno, int32_t child_index) -{ - xlator_t *this = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - char lockee[256]; - char lock_call_type_str[256]; - char verdict[16]; - - this = THIS; - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - if (!priv->inodelk_trace) { - return; - } - - afr_print_lockee 
(lockee, 256, &local->loc, local->fd, child_index); - - afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); - - afr_print_verdict (op_ret, op_errno, verdict); - - gf_log (this->name, GF_LOG_NORMAL, - "[%s %s] [%s] Lockee={%s} Number={%llu}", - lock_call_type_str, - lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", - verdict, - lockee, - (unsigned long long) int_lock->lock_number); + for (i = 0; i < int_lock->lockee_count; i++) { + afr_lockee_cleanup(&int_lock->lockee[i]); + } + return; } - -static void -afr_trace_inodelk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type, - afr_lock_op_type_t lk_op_type, struct gf_flock *flock, - int32_t cmd, int32_t child_index) -{ - xlator_t *this = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - - char lock[256]; - char lockee[256]; - char lock_call_type_str[256]; - - this = THIS; - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - if (!priv->inodelk_trace) { - return; - } - - afr_print_inodelk (lock, 256, cmd, flock, frame->root->lk_owner); - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); - - afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); - - gf_log (this->name, GF_LOG_NORMAL, - "[%s %s] Lock={%s} Lockee={%s} Number={%llu}", - lock_call_type_str, - lk_op_type == AFR_LOCK_OP ? 
"LOCK REQUEST" : "UNLOCK REQUEST", - lock, lockee, - (unsigned long long) int_lock->lock_number); - +int +afr_entry_lockee_cmp(const void *l1, const void *l2) +{ + const afr_lockee_t *r1 = l1; + const afr_lockee_t *r2 = l2; + int ret = 0; + uuid_t gfid1 = {0}; + uuid_t gfid2 = {0}; + + loc_gfid((loc_t *)&r1->loc, gfid1); + loc_gfid((loc_t *)&r2->loc, gfid2); + ret = gf_uuid_compare(gfid1, gfid2); + /*Entrylks with NULL basename are the 'smallest'*/ + if (ret == 0) { + if (!r1->basename) + return -1; + if (!r2->basename) + return 1; + ret = strcmp(r1->basename, r2->basename); + } + + if (ret <= 0) + return -1; + else + return 1; } -static void -afr_trace_entrylk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type, - afr_lock_op_type_t lk_op_type, const char *basename, - int32_t child_index) -{ - xlator_t *this = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - - char lock[256]; - char lockee[256]; - char lock_call_type_str[256]; - - this = THIS; - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - if (!priv->entrylk_trace) { - return; - } - - afr_print_entrylk (lock, 256, basename, frame->root->lk_owner); - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); - - afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); - - gf_log (this->name, GF_LOG_NORMAL, - "[%s %s] Lock={%s} Lockee={%s} Number={%llu}", - lock_call_type_str, - lk_op_type == AFR_LOCK_OP ? 
"LOCK REQUEST" : "UNLOCK REQUEST", - lock, lockee, - (unsigned long long) int_lock->lock_number); -} +int +afr_lock_blocking(call_frame_t *frame, xlator_t *this, int child_index); -static void -afr_trace_entrylk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, - afr_lock_op_type_t lk_op_type, const char *basename, int op_ret, - int op_errno, int32_t child_index) +void +afr_set_lk_owner(call_frame_t *frame, xlator_t *this, void *lk_owner) { - xlator_t *this = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - char lock[256]; - char lockee[256]; - char lock_call_type_str[256]; - char verdict[16]; - - this = THIS; - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - if (!priv->entrylk_trace) { - return; - } - - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); - - afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); - - afr_print_verdict (op_ret, op_errno, verdict); - - gf_log (this->name, GF_LOG_NORMAL, - "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu}", - lock_call_type_str, - lk_op_type == AFR_LOCK_OP ? 
"LOCK REPLY" : "UNLOCK REPLY", - verdict, - lock, lockee, - (unsigned long long) int_lock->lock_number); + gf_msg_trace(this->name, 0, "Setting lk-owner=%llu", + (unsigned long long)(unsigned long)lk_owner); + set_lk_owner_from_ptr(&frame->root->lk_owner, lk_owner); } -static int -transaction_lk_op (afr_local_t *local) +int32_t +internal_lock_count(call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - int ret = -1; - - int_lock = &local->internal_lock; - - if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) { - gf_log (THIS->name, GF_LOG_DEBUG, - "lk op is for a transaction"); - ret = 1; - } - else if (int_lock->transaction_lk_type == AFR_SELFHEAL_LK) { - gf_log (THIS->name, GF_LOG_DEBUG, - "lk op is for a self heal"); - - ret = 0; - } + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int32_t call_count = 0; + int i = 0; - if (ret == -1) - gf_log (THIS->name, GF_LOG_DEBUG, - "lk op is not set"); + local = frame->local; + priv = this->private; - return ret; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) + ++call_count; + } + return call_count; } -static int -is_afr_lock_transaction (afr_local_t *local) +int +afr_add_entry_lockee(afr_local_t *local, loc_t *loc, char *basename, + int child_count) { - int ret = 0; + int ret = -ENOMEM; + afr_internal_lock_t *int_lock = &local->internal_lock; + afr_lockee_t *lockee = &int_lock->lockee[int_lock->lockee_count]; - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - ret = 1; - break; + GF_ASSERT(int_lock->lockee_count < AFR_LOCKEE_COUNT_MAX); + loc_copy(&lockee->loc, loc); + lockee->basename = (basename) ? 
gf_strdup(basename) : NULL; + if (basename && !lockee->basename) + goto out; - case AFR_ENTRY_RENAME_TRANSACTION: - case AFR_ENTRY_TRANSACTION: - ret = 0; - break; + lockee->locked_count = 0; + lockee->locked_nodes = GF_CALLOC(child_count, sizeof(*lockee->locked_nodes), + gf_afr_mt_afr_node_character); - } + if (!lockee->locked_nodes) + goto out; - return ret; + ret = 0; + int_lock->lockee_count++; +out: + if (ret) { + afr_lockee_cleanup(lockee); + } + return ret; } -static int -initialize_entrylk_variables (call_frame_t *frame, xlator_t *this) +int +afr_add_inode_lockee(afr_local_t *local, int child_count) { - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; + int ret = -ENOMEM; + afr_internal_lock_t *int_lock = &local->internal_lock; + afr_lockee_t *lockee = &int_lock->lockee[int_lock->lockee_count]; - int i = 0; + if (local->fd) { + lockee->fd = fd_ref(local->fd); + } else { + loc_copy(&lockee->loc, &local->loc); + } - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; + lockee->locked_count = 0; + lockee->locked_nodes = GF_CALLOC(child_count, sizeof(*lockee->locked_nodes), + gf_afr_mt_afr_node_character); - int_lock->entrylk_lock_count = 0; - int_lock->lock_op_ret = -1; - int_lock->lock_op_errno = 0; + if (!lockee->locked_nodes) + goto out; - for (i = 0; i < priv->child_count; i++) { - int_lock->entry_locked_nodes[i] = 0; - } - - return 0; + ret = 0; + int_lock->lockee_count++; +out: + if (ret) { + afr_lockee_cleanup(lockee); + } + return ret; } static int -initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) +initialize_internal_lock_variables(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_private_t *priv = NULL; - int i = 0; + int i = 0; - priv = this->private; - local = frame->local; - int_lock = 
&local->internal_lock; + priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; - int_lock->inodelk_lock_count = 0; - int_lock->lock_op_ret = -1; - int_lock->lock_op_errno = 0; + int_lock->lock_count = 0; + int_lock->lock_op_ret = -1; + int_lock->lock_op_errno = 0; + int_lock->lk_attempted_count = 0; - for (i = 0; i < priv->child_count; i++) { - int_lock->inode_locked_nodes[i] = 0; - } + for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) { + if (!int_lock->lockee[i].locked_nodes) + break; + int_lock->lockee[i].locked_count = 0; + memset(int_lock->lockee[i].locked_nodes, 0, + sizeof(*int_lock->lockee[i].locked_nodes) * priv->child_count); + } - return 0; -} - -loc_t * -lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) -{ - int ret = 0; - - ret = strcmp (l1->path, l2->path); - - if (ret == 0) - ret = strcmp (b1, b2); - - if (ret <= 0) - return l1; - else - return l2; + return 0; } int -afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) - +afr_lockee_locked_nodes_count(afr_internal_lock_t *int_lock) { - int i; - int call_count = 0; + int call_count = 0; + int i = 0; - for (i = 0; i < child_count; i++) { - if (locked_nodes[i] & LOCKED_YES) - call_count++; - } + for (i = 0; i < int_lock->lockee_count; i++) + call_count += int_lock->lockee[i].locked_count; - return call_count; + return call_count; } -/* FIXME: What if UNLOCK fails */ -static int32_t -afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - int call_count = 0; - - local = frame->local; - int_lock = &local->internal_lock; - - LOCK (&frame->lock); - { - call_count = --int_lock->lk_call_count; - } - UNLOCK (&frame->lock); - - if (call_count == 0) { - gf_log (this->name, GF_LOG_TRACE, - "All internal locks unlocked"); - int_lock->lock_cbk (frame, this); - } - - return 0; -} +int +afr_locked_nodes_count(unsigned char *locked_nodes, 
int child_count) -static int32_t -afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) { - afr_trace_inodelk_out (frame, AFR_INODELK_TRANSACTION, - AFR_UNLOCK_OP, NULL, op_ret, - op_errno, (long) cookie); - - if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { - gf_log (this->name, GF_LOG_TRACE, - "Unlock failed for some reason"); - } - - afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno); + int i = 0; + int call_count = 0; - return 0; + for (i = 0; i < child_count; i++) { + if (locked_nodes[i] & LOCKED_YES) + call_count++; + } + return call_count; } -static int -afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) +static void +afr_log_locks_failure(call_frame_t *frame, char *where, char *what, + int op_errno) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - struct gf_flock flock; - int call_count = 0; - int i = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = F_UNLCK; - - call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes, - priv->child_count); + xlator_t *this = frame->this; + gf_lkowner_t *lk_owner = &frame->root->lk_owner; + afr_local_t *local = frame->local; + const char *fop = NULL; + char *gfid = NULL; + const char *name = NULL; - int_lock->lk_call_count = call_count; + fop = gf_fop_list[local->op]; - if (!call_count) { - gf_log (this->name, GF_LOG_TRACE, - "No internal locks unlocked"); - int_lock->lock_cbk (frame, this); - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - if (int_lock->inode_locked_nodes[i] & LOCKED_YES) { - if (local->fd) { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, - AFR_UNLOCK_OP, &flock, F_SETLK, i); - - STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, - (void *) (long)i, - priv->children[i], - 
priv->children[i]->fops->finodelk, - this->name, local->fd, - F_SETLK, &flock); - - if (!--call_count) - break; - - } else { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, - AFR_UNLOCK_OP, &flock, F_SETLK, i); - - STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, - (void *) (long)i, - priv->children[i], - priv->children[i]->fops->inodelk, - this->name, &local->loc, - F_SETLK, &flock); - - if (!--call_count) - break; - - } - - } - - } - -out: - return 0; + switch (local->transaction.type) { + case AFR_ENTRY_RENAME_TRANSACTION: + case AFR_ENTRY_TRANSACTION: + switch (local->op) { + case GF_FOP_LINK: + gfid = uuid_utoa(local->newloc.pargfid); + name = local->newloc.name; + break; + default: + gfid = uuid_utoa(local->loc.pargfid); + name = local->loc.name; + break; + } + gf_msg(this->name, GF_LOG_WARNING, op_errno, + AFR_MSG_INTERNAL_LKS_FAILED, + "Unable to do entry %s with lk-owner:%s on %s " + "while attempting %s on {pgfid:%s, name:%s}.", + what, lkowner_utoa(lk_owner), where, fop, gfid, name); + break; + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + gfid = uuid_utoa(local->inode->gfid); + gf_msg(this->name, GF_LOG_WARNING, op_errno, + AFR_MSG_INTERNAL_LKS_FAILED, + "Unable to do inode %s with lk-owner:%s on %s " + "while attempting %s on gfid:%s.", + what, lkowner_utoa(lk_owner), where, fop, gfid); + break; + } } static int32_t -afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_unlock_common_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION, - AFR_UNLOCK_OP, NULL, op_ret, - op_errno, (long) cookie); + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; + int lockee_num = 0; + int call_count = 0; + int child_index = 0; + int ret = 0; - afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno); + local = 
frame->local; + int_lock = &local->internal_lock; + priv = this->private; + lockee_num = (int)((long)cookie) / priv->child_count; + child_index = (int)((long)cookie) % priv->child_count; - return 0; -} - -static int -afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - - int call_count = 0; - int i = -1; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; - - call_count = afr_locked_nodes_count (int_lock->entry_locked_nodes, - priv->child_count); - int_lock->lk_call_count = call_count; - - if (!call_count){ - gf_log (this->name, GF_LOG_TRACE, - "No internal locks unlocked"); - int_lock->lock_cbk (frame, this); - goto out; - } + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + afr_log_locks_failure(frame, priv->children[child_index]->name, + "unlock", op_errno); + } - for (i = 0; i < priv->child_count; i++) { - if (int_lock->entry_locked_nodes[i] & LOCKED_YES) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_UNLOCK_OP, basename, i); - - STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - loc, basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); - - if (!--call_count) - break; - } - } - -out: - return 0; + int_lock->lockee[lockee_num].locked_nodes[child_index] &= LOCKED_NO; + if (local->transaction.type == AFR_DATA_TRANSACTION && op_ret != 1) + ret = afr_write_subvol_reset(frame, this); -} + LOCK(&frame->lock); + { + call_count = --int_lock->lk_call_count; + } + UNLOCK(&frame->lock); -static int32_t -afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - 
afr_private_t *priv = NULL; - - int done = 0; - int child_index = (long) cookie; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/posix-locks xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; - done = 1; - } - - local->child_up[child_index] = 0; - local->op_errno = op_errno; - int_lock->lock_op_errno = op_errno; - } - } - UNLOCK (&frame->lock); - - if ((op_ret == -1) && - (op_errno == ENOSYS)) { - afr_unlock (frame, this); - } else { - if (op_ret == 0) { - int_lock->locked_nodes[child_index] - |= LOCKED_YES; - int_lock->lock_count++; - } - afr_lock_blocking (frame, this, child_index + 1); - } + if (call_count == 0) { + int_lock->lock_cbk(frame, this); + } - return 0; + return ret; } -static int32_t -afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_trace_inodelk_out (frame, AFR_INODELK_TRANSACTION, - AFR_LOCK_OP, NULL, op_ret, - op_errno, (long) cookie); - - afr_lock_cbk (frame, cookie, this, op_ret, op_errno); - return 0; - -} - -static int32_t -afr_lock_lower_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *lower_name = NULL; - const char *higher_name = NULL; - - int child_index = (long) cookie; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. 
" - "please load features/posix-locks xlator on server"); - - local->op_ret = op_ret; - } - - local->child_up[child_index] = 0; - local->op_errno = op_errno; - } - } - UNLOCK (&frame->lock); - - if (op_ret != 0) { - afr_unlock (frame, this); - goto out; - } else { - int_lock->lower_locked_nodes[child_index] |= LOCKED_LOWER; - int_lock->lock_count++; - } - - /* The lower path has been locked. Now lock the higher path */ - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, higher_name, child_index); - - - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, higher, higher_name, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - -out: - return 0; -} - -static int32_t -afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, NULL, op_ret, - op_errno, (long)cookie); - - afr_lock_cbk (frame, cookie, this, op_ret, op_errno); - return 0; -} - -static int -afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case 
AFR_METADATA_TRANSACTION: - memcpy (int_lock->inode_locked_nodes, - int_lock->locked_nodes, - priv->child_count); - int_lock->inodelk_lock_count = int_lock->lock_count; - break; - - case AFR_ENTRY_RENAME_TRANSACTION: +void +afr_internal_lock_wind(call_frame_t *frame, + int32_t (*cbk)(call_frame_t *, void *, xlator_t *, + int32_t, int32_t, dict_t *), + void *cookie, int child, int lockee_num, + gf_boolean_t blocking, gf_boolean_t unlock) +{ + afr_local_t *local = frame->local; + xlator_t *this = frame->this; + afr_private_t *priv = this->private; + afr_internal_lock_t *int_lock = &local->internal_lock; + entrylk_cmd cmd = ENTRYLK_LOCK_NB; + int32_t cmd1 = F_SETLK; + struct gf_flock flock = { + 0, + }; + + switch (local->transaction.type) { case AFR_ENTRY_TRANSACTION: - memcpy (int_lock->entry_locked_nodes, - int_lock->locked_nodes, - priv->child_count); - int_lock->entrylk_lock_count = int_lock->lock_count; - break; - } - - return 0; - -} - -int -afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *lower_name = NULL; - const char *higher_name = NULL; - - struct gf_flock flock; - uint64_t ctx; - int ret = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; - - if (local->fd) { - ret = fd_ctx_get (local->fd, this, &ctx); - - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "unable to get fd ctx for fd=%p", - local->fd); - - local->op_ret = -1; - int_lock->lock_op_ret = -1; - local->op_errno = EINVAL; - int_lock->lock_op_errno = EINVAL; - - afr_copy_locked_nodes (frame, this); - - afr_unlock (frame, this); - - return 0; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - /* skip over 
children that or down - or don't have the fd open */ - - while ((child_index < priv->child_count) - && (!local->child_up[child_index] - || !fd_ctx->opened_on[child_index])) - - child_index++; - } else { - /* skip over children that are down */ - while ((child_index < priv->child_count) - && !local->child_up[child_index]) - child_index++; - } - - if ((child_index == priv->child_count) && - int_lock->lock_count == 0) { - - gf_log (this->name, GF_LOG_DEBUG, - "unable to lock on even one child"); - - local->op_ret = -1; - int_lock->lock_op_ret = -1; - local->op_errno = EAGAIN; - int_lock->lock_op_errno = EAGAIN; - - afr_copy_locked_nodes (frame, this); - - afr_unlock(frame, this); - - return 0; - - } - - if ((child_index == priv->child_count) - || (int_lock->lock_count == - afr_up_children_count (priv->child_count, - local->child_up))) { - - /* we're done locking */ - - gf_log (this->name, GF_LOG_DEBUG, - "we're done locking"); - - afr_copy_locked_nodes (frame, this); - - int_lock->lock_op_ret = 0; - int_lock->lock_cbk (frame, this); - return 0; - } - - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - - if (local->fd) { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, - AFR_LOCK_OP, &flock, F_SETLKW, - child_index); - - STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->finodelk, - this->name, local->fd, - F_SETLKW, &flock); - - } else { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, - AFR_LOCK_OP, &flock, F_SETLKW, - child_index); - - STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->inodelk, - this->name, &local->loc, - F_SETLKW, &flock); - } - - break; - - case AFR_ENTRY_RENAME_TRANSACTION: - { - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - 
&local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, lower_name, child_index); - - - STACK_WIND_COOKIE (frame, afr_lock_lower_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, lower, lower_name, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - - break; - } - - case AFR_ENTRY_TRANSACTION: - if (local->fd) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, local->transaction.basename, - child_index); - - STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->fentrylk, - this->name, local->fd, - local->transaction.basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - } else { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, local->transaction.basename, - child_index); - - STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, - &local->transaction.parent_loc, - local->transaction.basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - } - - break; - } - - return 0; - - + case AFR_ENTRY_RENAME_TRANSACTION: + if (unlock) { + cmd = ENTRYLK_UNLOCK; + } else if (blocking) { /*Doesn't make sense to have blocking + unlock*/ + cmd = ENTRYLK_LOCK; + } + + if (local->fd) { + STACK_WIND_COOKIE(frame, cbk, cookie, priv->children[child], + priv->children[child]->fops->fentrylk, + int_lock->domain, + 
int_lock->lockee[lockee_num].fd, + int_lock->lockee[lockee_num].basename, cmd, + ENTRYLK_WRLCK, NULL); + } else { + STACK_WIND_COOKIE(frame, cbk, cookie, priv->children[child], + priv->children[child]->fops->entrylk, + int_lock->domain, + &int_lock->lockee[lockee_num].loc, + int_lock->lockee[lockee_num].basename, cmd, + ENTRYLK_WRLCK, NULL); + } + break; + + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + flock = int_lock->lockee[lockee_num].flock; + if (unlock) { + flock.l_type = F_UNLCK; + } else if (blocking) { /*Doesn't make sense to have blocking + unlock*/ + cmd1 = F_SETLKW; + } + + if (local->fd) { + STACK_WIND_COOKIE( + frame, cbk, cookie, priv->children[child], + priv->children[child]->fops->finodelk, int_lock->domain, + int_lock->lockee[lockee_num].fd, cmd1, &flock, NULL); + } else { + STACK_WIND_COOKIE( + frame, cbk, cookie, priv->children[child], + priv->children[child]->fops->inodelk, int_lock->domain, + &int_lock->lockee[lockee_num].loc, cmd1, &flock, NULL); + } + break; + } } -int32_t -afr_blocking_lock (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - initialize_inodelk_variables (frame, this); - break; - - case AFR_ENTRY_RENAME_TRANSACTION: - case AFR_ENTRY_TRANSACTION: - initialize_entrylk_variables (frame, this); +static int +afr_unlock_now(call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + int lockee_num = 0; + int i = -1; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + + call_count = afr_lockee_locked_nodes_count(int_lock); + + int_lock->lk_call_count = call_count; + + if (!call_count) 
{ + gf_msg_trace(this->name, 0, "No internal locks unlocked"); + int_lock->lock_cbk(frame, this); + goto out; + } + + for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) { + lockee_num = i / priv->child_count; + child_index = i % priv->child_count; + if (int_lock->lockee[lockee_num].locked_nodes[child_index] & + LOCKED_YES) { + afr_internal_lock_wind(frame, afr_unlock_common_cbk, + (void *)(long)i, child_index, lockee_num, + _gf_false, _gf_true); + if (!--call_count) break; } + } - afr_lock_blocking (frame, this, 0); - - return 0; +out: + return 0; } static int32_t -afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = 0; - int child_index = (long) cookie; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, NULL, op_ret, - op_errno, (long) cookie); - - LOCK (&frame->lock); - { - call_count = --int_lock->lk_call_count; - } - UNLOCK (&frame->lock); - - if (op_ret < 0 ) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/posix-locks xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; - - local->child_up[child_index] = 0; - int_lock->lock_op_errno = op_errno; - local->op_errno = op_errno; - } - } else if (op_ret == 0) { - int_lock->entry_locked_nodes[child_index] - |= LOCKED_YES; - int_lock->entrylk_lock_count++; - } - - if (call_count == 0) { - gf_log (this->name, GF_LOG_TRACE, - "Last locking reply received"); - /* all locks successfull. Proceed to call FOP */ - if (int_lock->entrylk_lock_count == - afr_up_children_count (priv->child_count, local->child_up)) { - gf_log (this->name, GF_LOG_TRACE, - "All servers locked. 
Calling the cbk"); - int_lock->lock_op_ret = 0; - int_lock->lock_cbk (frame, this); - } - /* Not all locks were successfull. Unlock and try locking - again, this time with serially blocking locks */ - else { - gf_log (this->name, GF_LOG_TRACE, - "%d servers locked. Trying again with blocking calls", - int_lock->lock_count); - - afr_unlock(frame, this); +afr_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int cky = (long)cookie; + int child_index = 0; + int lockee_num = 0; + + priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; + + child_index = ((int)cky) % priv->child_count; + lockee_num = ((int)cky) / priv->child_count; + + LOCK(&frame->lock); + { + if (op_ret == -1) { + if (op_errno == ENOSYS) { + /* return ENOTSUP */ + gf_msg(this->name, GF_LOG_ERROR, ENOSYS, + AFR_MSG_LOCK_XLATOR_NOT_LOADED, + "subvolume does not support locking. 
" + "please load features/locks xlator on server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + } + + local->op_errno = op_errno; + int_lock->lock_op_errno = op_errno; + } + + int_lock->lk_attempted_count++; + } + UNLOCK(&frame->lock); + + if ((op_ret == -1) && (op_errno == ENOSYS)) { + afr_unlock_now(frame, this); + } else { + if (op_ret == 0) { + int_lock->lockee[lockee_num] + .locked_nodes[child_index] |= LOCKED_YES; + int_lock->lockee[lockee_num].locked_count++; + int_lock->lock_count++; + if (local->transaction.type == AFR_DATA_TRANSACTION) { + LOCK(&local->inode->lock); + { + local->inode_ctx->lock_count++; } + UNLOCK(&local->inode->lock); + } } + afr_lock_blocking(frame, this, cky + 1); + } - return 0; + return 0; } -int -afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) +static gf_boolean_t +_is_lock_wind_needed(afr_local_t *local, int child_index) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - - int32_t call_count = 0; - int i = 0; - uint64_t ctx; - int ret = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - initialize_entrylk_variables (frame, this); - - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; - - if (local->fd) { - ret = fd_ctx_get (local->fd, this, &ctx); - - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "unable to get fd ctx for fd=%p", - local->fd); - - local->op_ret = -1; - int_lock->lock_op_ret = -1; - local->op_errno = EINVAL; - int_lock->lock_op_errno = EINVAL; - - return -1; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - call_count = internal_lock_count (frame, this, fd_ctx); - int_lock->lk_call_count = call_count; + if (!local->child_up[child_index]) + return _gf_false; - if (!call_count) { - gf_log (this->name, GF_LOG_DEBUG, - "fd not open on any subvolumes. 
aborting."); - afr_unlock (frame, this); - goto out; - } - - /* Send non-blocking entrylk calls only on up children - and where the fd has been opened */ - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && fd_ctx->opened_on[i]) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_LOCK_OP, basename, i); - - STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fentrylk, - this->name, local->fd, - basename, - ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); - } - } - } else { - GF_ASSERT (loc); - - call_count = internal_lock_count (frame, this, NULL); - int_lock->lk_call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_LOCK_OP, basename, i); - - STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, loc, basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - - if (!--call_count) - break; - - } - } - } -out: - return 0; + return _gf_true; } -int32_t -afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +static gf_boolean_t +is_blocking_locks_count_sufficient(call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = 0; - int child_index = (long) cookie; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; + int child = 0; + int nlockee = 0; + int lockee_count = 0; + gf_boolean_t ret = _gf_true; - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - afr_trace_inodelk_out (frame, AFR_INODELK_NB_TRANSACTION, - AFR_LOCK_OP, NULL, op_ret, - op_errno, (long) cookie); - - LOCK (&frame->lock); - { - call_count = --int_lock->lk_call_count; - } - UNLOCK 
(&frame->lock); - - if (op_ret < 0 ) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/posix-locks xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; - local->child_up[child_index] = 0; - int_lock->lock_op_errno = op_errno; - local->op_errno = op_errno; - } - } else if (op_ret == 0) { - int_lock->inode_locked_nodes[child_index] - |= LOCKED_YES; - int_lock->inodelk_lock_count++; - } + local = frame->local; + priv = this->private; + int_lock = &local->internal_lock; + lockee_count = int_lock->lockee_count; - if (call_count == 0) { - gf_log (this->name, GF_LOG_TRACE, - "Last inode locking reply received"); - /* all locks successfull. Proceed to call FOP */ - if (int_lock->inodelk_lock_count == - afr_up_children_count (priv->child_count, local->child_up)) { - gf_log (this->name, GF_LOG_TRACE, - "All servers locked. Calling the cbk"); - int_lock->lock_op_ret = 0; - int_lock->lock_cbk (frame, this); - } - /* Not all locks were successfull. Unlock and try locking - again, this time with serially blocking locks */ - else { - gf_log (this->name, GF_LOG_TRACE, - "%d servers locked. Trying again with blocking calls", - int_lock->lock_count); - - afr_unlock(frame, this); - } + if (int_lock->lock_count == 0) { + afr_log_locks_failure(frame, "any subvolume", "lock", + int_lock->lock_op_errno); + return _gf_false; + } + /* For FOPS that take multiple sets of locks (mkdir, rename), + * there must be at least one brick on which the locks from + * all lock sets were successful. 
*/ + for (child = 0; child < priv->child_count; child++) { + ret = _gf_true; + for (nlockee = 0; nlockee < lockee_count; nlockee++) { + if (!(int_lock->lockee[nlockee].locked_nodes[child] & LOCKED_YES)) + ret = _gf_false; } + if (ret) + return ret; + } + if (!ret) + afr_log_locks_failure(frame, "all", "lock", int_lock->lock_op_errno); - return 0; + return ret; } int -afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) +afr_lock_blocking(call_frame_t *frame, xlator_t *this, int cookie) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - - int32_t call_count = 0; - uint64_t ctx = 0; - int i = 0; - int ret = 0; - struct gf_flock flock; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; - - initialize_inodelk_variables (frame, this); - - if (local->fd) { - ret = fd_ctx_get (local->fd, this, &ctx); - - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "unable to get fd ctx for fd=%p", - local->fd); - - local->op_ret = -1; - int_lock->lock_op_ret = -1; - local->op_errno = EINVAL; - int_lock->lock_op_errno = EINVAL; - - ret = -1; - goto out; - } + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + uint64_t ctx = 0; + int ret = 0; + int child_index = 0; + int lockee_num = 0; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + child_index = cookie % priv->child_count; + lockee_num = cookie / priv->child_count; - call_count = internal_lock_count (frame, this, fd_ctx); - int_lock->lk_call_count = call_count; - - if (!call_count) { - gf_log (this->name, GF_LOG_DEBUG, - "fd not open on any subvolumes. 
aborting."); - afr_unlock (frame, this); - goto out; - } + if (local->fd) { + ret = fd_ctx_get(local->fd, this, &ctx); - /* Send non-blocking inodelk calls only on up children - and where the fd has been opened */ - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && fd_ctx->opened_on[i]) { - afr_trace_inodelk_in (frame, AFR_INODELK_NB_TRANSACTION, - AFR_LOCK_OP, &flock, F_SETLK, i); + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_FD_CTX_GET_FAILED, + "unable to get fd ctx for fd=%p", local->fd); - STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->finodelk, - this->name, local->fd, - F_SETLK, &flock); + local->op_ret = -1; + int_lock->lock_op_ret = -1; - if (!--call_count) - break; + afr_unlock_now(frame, this); - } - - } - } else { - call_count = internal_lock_count (frame, this, NULL); - int_lock->lk_call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - afr_trace_inodelk_in (frame, AFR_INODELK_NB_TRANSACTION, - AFR_LOCK_OP, &flock, F_SETLK, i); - - STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->inodelk, - this->name, &local->loc, - F_SETLK, &flock); - - if (!--call_count) - break; - - } - } + return 0; } + } -out: - return ret; -} - -static int -__is_lower_locked (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int count = 0; - int i = 0; + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { + if (!is_blocking_locks_count_sufficient(frame, this)) { + local->op_ret = -1; + int_lock->lock_op_ret = -1; - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; + afr_unlock_now(frame, this); - for (i = 0; i < priv->child_count; i++) { - if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER) - count++; + return 
0; } + } - return count; + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { + /* we're done locking */ -} - -static int -__is_higher_locked (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int count = 0; - int i = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (int_lock->locked_nodes[i] & LOCKED_YES) - count++; - } + gf_msg_debug(this->name, 0, "we're done locking"); - return count; - -} - -static int -afr_unlock_lower_entrylk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - - int call_count = 0; - int i = -1; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; - - call_count = __is_lower_locked (frame, this); - int_lock->lk_call_count = call_count; - - if (!call_count){ - gf_log (this->name, GF_LOG_TRACE, - "No internal locks unlocked"); - int_lock->lock_cbk (frame, this); - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_UNLOCK_OP, basename, i); - - STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - loc, basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); - - if (!--call_count) - break; - - } - } - -out: + int_lock->lock_op_ret = 0; + int_lock->lock_cbk(frame, this); return 0; + } -} - - -static int -afr_post_unlock_higher_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.done (frame, this); - return 0; -} - 
-static int -afr_post_unlock_lower_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *lower_name = NULL; - const char *higher_name = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - if (__is_higher_locked (frame, this)) { - gf_log (this->name, GF_LOG_DEBUG, - "unlocking higher"); - int_lock->lk_basename = higher_name; - int_lock->lk_loc = higher; - int_lock->lock_cbk = afr_post_unlock_higher_cbk; - - afr_unlock_entrylk (frame, this); - } else - local->transaction.done (frame, this); - - return 0; -} - -static int -afr_rename_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *lower_name = NULL; - const char *higher_name = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? 
- &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - - if (__is_lower_locked (frame, this)) { - gf_log (this->name, GF_LOG_DEBUG, - "unlocking lower"); - int_lock->lk_basename = lower_name; - int_lock->lk_loc = lower; - int_lock->lock_cbk = afr_post_unlock_lower_cbk; - - afr_unlock_lower_entrylk (frame, this); - } else - afr_post_unlock_lower_cbk (frame, this); - + if (!_is_lock_wind_needed(local, child_index)) { + afr_lock_blocking(frame, this, cookie + 1); return 0; -} - -static int -afr_rename_transaction (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; + } - return (local->transaction.type == - AFR_ENTRY_RENAME_TRANSACTION); + afr_internal_lock_wind(frame, afr_lock_cbk, (void *)(long)cookie, + child_index, lockee_num, _gf_true, _gf_false); + return 0; } int32_t -afr_unlock (call_frame_t *frame, xlator_t *this) +afr_blocking_lock(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - - local = frame->local; - - if (transaction_lk_op (local)) { - if (is_afr_lock_transaction (local)) - afr_unlock_inodelk (frame, this); - else - if (!afr_rename_transaction (frame, this)) - afr_unlock_entrylk (frame, this); - else - afr_rename_unlock (frame, this); - } else { - if (is_afr_lock_selfheal (local)) - afr_unlock_inodelk (frame, this); - else - afr_unlock_entrylk (frame, this); - } + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int up_count = 0; - return 0; -} + priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; -int -afr_mark_locked_nodes (xlator_t *this, fd_t *fd, - unsigned char *locked_nodes) -{ - afr_private_t *priv = NULL; - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; + up_count = AFR_COUNT(local->child_up, priv->child_count); + 
int_lock->lk_call_count = int_lock->lk_expected_count = + (int_lock->lockee_count * up_count); + initialize_internal_lock_variables(frame, this); - priv = this->private; - - ret = afr_fd_ctx_set (this, fd); - if (ret) - goto out; + afr_lock_blocking(frame, this, 0); - ret = fd_ctx_get (fd, this, &tmp); - fdctx = (afr_fd_ctx_t *) (long) tmp; - - GF_ASSERT (fdctx->locked_on); - - memcpy (fdctx->locked_on, locked_nodes, - priv->child_count); - -out: - return ret; + return 0; } -static int -__is_fd_saved (xlator_t *this, fd_t *fd) +static int32_t +afr_nb_internal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_locked_fd_t *locked_fd = NULL; - afr_private_t *priv = NULL; - int found = 0; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + int call_count = 0; + int child_index = 0; + int lockee_num = 0; + afr_private_t *priv = NULL; - priv = this->private; + priv = this->private; - list_for_each_entry (locked_fd, &priv->saved_fds, list) { - if (locked_fd->fd == fd) { - found = 1; - break; - } - } + child_index = ((long)cookie) % priv->child_count; + lockee_num = ((long)cookie) / priv->child_count; - return found; -} + local = frame->local; + int_lock = &local->internal_lock; -static int -__afr_save_locked_fd (xlator_t *this, fd_t *fd) -{ - afr_private_t *priv = NULL; - afr_locked_fd_t *locked_fd = NULL; - int ret = 0; - - priv = this->private; - - locked_fd = GF_CALLOC (1, sizeof (*locked_fd), - gf_afr_mt_locked_fd); - if (!locked_fd) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - ret = -1; - goto out; - } - - locked_fd->fd = fd; - INIT_LIST_HEAD (&locked_fd->list); - - list_add_tail (&locked_fd->list, &priv->saved_fds); - -out: - return ret; -} - -int -afr_save_locked_fd (xlator_t *this, fd_t *fd) -{ - afr_private_t *priv = NULL; - int ret = 0; - - priv = this->private; - - pthread_mutex_lock (&priv->mutex); + if (op_ret == 0 && local->transaction.type == 
AFR_DATA_TRANSACTION) { + LOCK(&local->inode->lock); { - if (__is_fd_saved (this, fd)) { - gf_log (this->name, GF_LOG_DEBUG, - "fd=%p already saved", fd); - goto unlock; - } - - ret = __afr_save_locked_fd (this, fd); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "fd=%p could not be saved", fd); - goto unlock; - } + local->inode_ctx->lock_count++; } -unlock: - pthread_mutex_unlock (&priv->mutex); - - return ret; -} - -static int -afr_lock_recovery_cleanup (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_locked_fd_t *locked_fd = NULL; - - local = frame->local; - - locked_fd = local->locked_fd; - - STACK_DESTROY (frame->root); - afr_local_cleanup (local, this); - - afr_save_locked_fd (this, locked_fd->fd); - - return 0; - -} - -static int -afr_get_source_lock_recovery (xlator_t *this, fd_t *fd) -{ - afr_fd_ctx_t *fdctx = NULL; - afr_private_t *priv = NULL; - uint64_t tmp = 0; - int i = 0; - int source_child = -1; - int ret = 0; - - priv = this->private; - - ret = fd_ctx_get (fd, this, &tmp); - if (ret) - goto out; - - fdctx = (afr_fd_ctx_t *) (long) tmp; - - for (i = 0; i < priv->child_count; i++) { - if (fdctx->locked_on[i]) { - gf_log (this->name, GF_LOG_DEBUG, - "Found lock recovery source=%d", - i); - source_child = i; - break; - } - - } - -out: - return source_child; - -} + UNLOCK(&local->inode->lock); + } -int32_t -afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock); -int32_t -afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int32_t source_child = 0; - struct gf_flock flock = {0,}; - - local = frame->local; - priv = this->private; - - if (op_ret) { - gf_log (this->name, GF_LOG_DEBUG, - "lock recovery failed"); - goto cleanup; + LOCK(&frame->lock); + { + if (op_ret < 0) { + if (op_errno == ENOSYS) { + /* 
return ENOTSUP */ + gf_msg(this->name, GF_LOG_ERROR, ENOSYS, + AFR_MSG_LOCK_XLATOR_NOT_LOADED, + "subvolume does not support " + "locking. please load features/locks" + " xlator on server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } + } else if (op_ret == 0) { + int_lock->lockee[lockee_num] + .locked_nodes[child_index] |= LOCKED_YES; + int_lock->lockee[lockee_num].locked_count++; + int_lock->lock_count++; } - source_child = local->source_child; - - memcpy (&flock, lock, sizeof (*lock)); - - STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk, - (void *) (long) source_child, - priv->children[source_child], - priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock); - - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - return 0; -} - -int -afr_recover_lock (call_frame_t *frame, xlator_t *this, - struct gf_flock *flock) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int32_t lock_recovery_child = 0; - - priv = this->private; - local = frame->local; - - lock_recovery_child = local->lock_recovery_child; + call_count = --int_lock->lk_call_count; + } + UNLOCK(&frame->lock); - frame->root->lk_owner = flock->l_owner; - - STACK_WIND_COOKIE (frame, afr_recover_lock_cbk, - (void *) (long) lock_recovery_child, - priv->children[lock_recovery_child], - priv->children[lock_recovery_child]->fops->lk, - local->fd, F_SETLK, flock); - - return 0; -} - -static int -is_afr_lock_eol (struct gf_flock *lock) -{ - int ret = 0; - - if ((lock->l_type == GF_LK_EOL)) - ret = 1; - - return ret; -} - -int32_t -afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) -{ - if (op_ret) { - gf_log (this->name, GF_LOG_DEBUG, - "Failed to get locks on fd"); - goto cleanup; + if (call_count == 0) { + gf_msg_trace(this->name, 0, "Last locking reply received"); + /* all locks successful. 
Proceed to call FOP */ + if (int_lock->lock_count == int_lock->lk_expected_count) { + gf_msg_trace(this->name, 0, "All servers locked. Calling the cbk"); + int_lock->lock_op_ret = 0; + int_lock->lock_cbk(frame, this); } + /* Not all locks were successful. Unlock and try locking + again, this time with serially blocking locks */ + else { + gf_msg_trace(this->name, 0, + "%d servers locked. Trying again " + "with blocking calls", + int_lock->lock_count); - gf_log (this->name, GF_LOG_DEBUG, - "Got a lock on fd"); - - if (is_afr_lock_eol (lock)) { - gf_log (this->name, GF_LOG_DEBUG, - "Reached EOL on locks on fd"); - goto cleanup; + afr_unlock_now(frame, this); } + } - afr_recover_lock (frame, this, lock); - - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - - return 0; + return 0; } -static int -afr_lock_recovery (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - fd_t *fd = NULL; - int ret = 0; - int32_t source_child = 0; - struct gf_flock flock = {0,}; - - priv = this->private; - local = frame->local; - - fd = local->fd; - - source_child = afr_get_source_lock_recovery (this, fd); - if (source_child < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Could not recover locks due to lock " - "split brain"); - ret = -1; - goto out; +int +afr_lock_nonblocking(call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int child = 0; + int lockee_num = 0; + int32_t call_count = 0; + int i = 0; + int ret = 0; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + + initialize_internal_lock_variables(frame, this); + + if (local->fd) { + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_FD_CTX_GET_FAILED, + "unable to get fd ctx for fd=%p", local->fd); + + local->op_ret = -1; + int_lock->lock_op_ret = -1; + 
local->op_errno = EINVAL; + int_lock->lock_op_errno = EINVAL; + + afr_unlock_now(frame, this); + ret = -1; + goto out; + } + } + + call_count = int_lock->lockee_count * internal_lock_count(frame, this); + int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; + + if (!call_count) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INFO_COMMON, + "fd not open on any subvolumes. aborting."); + afr_unlock_now(frame, this); + goto out; + } + + /* Send non-blocking lock calls only on up children + and where the fd has been opened */ + for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) { + child = i % priv->child_count; + lockee_num = i / priv->child_count; + if (local->child_up[child]) { + afr_internal_lock_wind(frame, afr_nb_internal_lock_cbk, + (void *)(long)i, child, lockee_num, + _gf_false, _gf_false); + if (!--call_count) + break; } - - local->source_child = source_child; - - /* the flock can be zero filled as we're querying incrementally - the locks held on the fd. 
- */ - STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk, - (void *) (long) source_child, - priv->children[source_child], - priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock); - + } out: - return ret; -} - - -static int -afr_mark_fd_opened (xlator_t *this, fd_t *fd, int32_t child_index) -{ - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; - - ret = fd_ctx_get (fd, this, &tmp); - if (ret) - goto out; - - fdctx = (afr_fd_ctx_t *) (long) tmp; - - fdctx->opened_on[child_index] = 1; - -out: - return ret; + return ret; } int32_t -afr_lock_recovery_preopen_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) -{ - int32_t child_index = (long )cookie; - int ret = 0; - - if (op_ret) { - gf_log (this->name, GF_LOG_DEBUG, - "Reopen during lock-recovery failed"); - goto cleanup; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Open succeeded => proceed to recover locks"); - - ret = afr_lock_recovery (frame, this); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "Lock recovery failed"); - goto cleanup; - } - - ret = afr_mark_fd_opened (this, fd, child_index); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "Marking fd open failed"); - goto cleanup; - } - - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - return 0; -} - -static int -afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - uint64_t tmp = 0; - afr_fd_ctx_t *fdctx = NULL; - loc_t loc = {0,}; - int32_t child_index = 0; - int ret = 0; - - priv = this->private; - local = frame->local; - - GF_ASSERT (local && local->fd); - - ret = fd_ctx_get (local->fd, this, &tmp); - fdctx = (afr_fd_ctx_t *) (long) tmp; - GF_ASSERT (fdctx); - - child_index = local->lock_recovery_child; - - inode_path (local->fd->inode, NULL, (char **)&loc.path); - loc.name = strrchr (loc.path, '/'); - loc.inode = inode_ref (local->fd->inode); - loc.parent = inode_parent 
(local->fd->inode, 0, NULL); - - - STACK_WIND_COOKIE (frame, afr_lock_recovery_preopen_cbk, - (void *)(long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->open, - &loc, fdctx->flags, local->fd, - fdctx->wbflags); - +afr_unlock(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_lock_t *lock = NULL; + + local = frame->local; + + if (!local->transaction.eager_lock_on) + goto out; + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK(&local->inode->lock); + { + list_del_init(&local->transaction.owner_list); + if (list_empty(&lock->owners) && list_empty(&lock->post_op)) { + local->transaction.do_eager_unlock = _gf_true; + /*TODO: Need to get metadata use on_disk and inherit/uninherit + *GF_ASSERT (!local->inode_ctx->on_disk[local->transaction.type]); + *GF_ASSERT (!local->inode_ctx->inherited[local->transaction.type]); + */ + GF_ASSERT(lock->release); + } + } + UNLOCK(&local->inode->lock); + if (!local->transaction.do_eager_unlock) { + local->internal_lock.lock_cbk(frame, this); return 0; -} - -static int -is_fd_opened (fd_t *fd, int32_t child_index) -{ - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; - - ret = fd_ctx_get (fd, THIS, &tmp); - if (ret) - goto out; - - fdctx = (afr_fd_ctx_t *) (long) tmp; - - if (fdctx->opened_on[child_index]) - ret = 1; - -out: - return ret; -} - -int -afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) -{ - call_frame_t *frame = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_locked_fd_t *locked_fd = NULL; - afr_locked_fd_t *tmp = NULL; - int ret = 0; - struct list_head locks_list; - - - priv = this->private; - - if (list_empty (&priv->saved_fds)) - goto out; - - frame = create_frame (this, this->ctx->pool); - if (!frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - ret = -1; - goto out; - } - - local = GF_CALLOC (1, sizeof (*local), - gf_afr_mt_afr_local_t); - if (!local) { - gf_log (this->name, 
GF_LOG_DEBUG, - "Out of memory"); - ret = -1; - goto out; - } - - AFR_LOCAL_INIT (local, priv); - if (!local) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - ret = -1; - goto out; - } - - frame->local = local; - - INIT_LIST_HEAD (&locks_list); - - pthread_mutex_lock (&priv->mutex); - { - list_splice_init (&priv->saved_fds, &locks_list); - } - pthread_mutex_unlock (&priv->mutex); - - list_for_each_entry_safe (locked_fd, tmp, - &locks_list, list) { - - list_del_init (&locked_fd->list); - - local->fd = fd_ref (locked_fd->fd); - local->lock_recovery_child = child_index; - local->locked_fd = locked_fd; - - if (!is_fd_opened (locked_fd->fd, child_index)) { - gf_log (this->name, GF_LOG_DEBUG, - "attempting open before lock " - "recovery"); - afr_lock_recovery_preopen (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "attempting lock recovery " - "without a preopen"); - afr_lock_recovery (frame, this); - } - } + } out: - return ret; + afr_unlock_now(frame, this); + return 0; } diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 14064ebcd76..816065fb57a 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -1,48 +1,38 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. 
- - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef __AFR_MEM_TYPES_H__ #define __AFR_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_afr_mem_types_ { - gf_afr_mt_iovec = gf_common_mt_end + 1, - gf_afr_mt_afr_fd_ctx_t, - gf_afr_mt_afr_local_t, - gf_afr_mt_afr_private_t, - gf_afr_mt_int32_t, - gf_afr_mt_char, - gf_afr_mt_xattr_key, - gf_afr_mt_dict_t, - gf_afr_mt_xlator_t, - gf_afr_mt_iatt, - gf_afr_mt_int, - gf_afr_mt_afr_node_character, - gf_afr_mt_sh_diff_loop_state, - gf_afr_mt_uint8_t, - gf_afr_mt_loc_t, - gf_afr_mt_entry_name, - gf_afr_mt_pump_priv, - gf_afr_mt_locked_fd, - gf_afr_mt_end + gf_afr_mt_afr_fd_ctx_t = gf_common_mt_end + 1, + gf_afr_mt_afr_private_t, + gf_afr_mt_int32_t, + gf_afr_mt_char, + gf_afr_mt_xattr_key, + gf_afr_mt_dict_t, + gf_afr_mt_xlator_t, + gf_afr_mt_afr_node_character, + gf_afr_mt_inode_ctx_t, + gf_afr_mt_shd_event_t, + gf_afr_mt_reply_t, + gf_afr_mt_subvol_healer_t, + gf_afr_mt_spbc_timeout_t, + gf_afr_mt_spb_status_t, + gf_afr_mt_empty_brick_t, + gf_afr_mt_child_latency_t, + gf_afr_mt_atomic_t, + gf_afr_mt_lk_heal_info_t, + gf_afr_mt_gf_lock, + gf_afr_mt_end }; #endif - diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h new file mode 100644 index 00000000000..e73fd997765 --- /dev/null +++ b/xlators/cluster/afr/src/afr-messages.h @@ -0,0 +1,167 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. + */ + +#ifndef _AFR_MESSAGES_H_ +#define _AFR_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID( + AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET, AFR_MSG_QUORUM_OVERRIDE, + AFR_MSG_INVALID_CHILD_UP, AFR_MSG_SUBVOL_UP, AFR_MSG_SUBVOLS_DOWN, + AFR_MSG_ENTRY_UNLOCK_FAIL, AFR_MSG_SPLIT_BRAIN, AFR_MSG_OPEN_FAIL, + AFR_MSG_UNLOCK_FAIL, AFR_MSG_REPLACE_BRICK_STATUS, AFR_MSG_GFID_NULL, + AFR_MSG_FD_CREATE_FAILED, AFR_MSG_DICT_SET_FAILED, + AFR_MSG_EXPUNGING_FILE_OR_DIR, AFR_MSG_MIGRATION_IN_PROGRESS, + AFR_MSG_CHILD_MISCONFIGURED, AFR_MSG_VOL_MISCONFIGURED, + AFR_MSG_INTERNAL_LKS_FAILED, AFR_MSG_INVALID_FD, AFR_MSG_LOCK_INFO, + AFR_MSG_LOCK_XLATOR_NOT_LOADED, AFR_MSG_FD_CTX_GET_FAILED, + AFR_MSG_INVALID_SUBVOL, AFR_MSG_PUMP_XLATOR_ERROR, AFR_MSG_SELF_HEAL_INFO, + AFR_MSG_READ_SUBVOL_ERROR, AFR_MSG_DICT_GET_FAILED, AFR_MSG_INFO_COMMON, + AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, AFR_MSG_LOCAL_CHILD, AFR_MSG_INVALID_DATA, + AFR_MSG_INVALID_ARG, AFR_MSG_INDEX_DIR_GET_FAILED, AFR_MSG_FSYNC_FAILED, + AFR_MSG_FAVORITE_CHILD, AFR_MSG_SELF_HEAL_FAILED, + AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS, AFR_MSG_NO_CHANGELOG, + AFR_MSG_TIMER_CREATE_FAIL, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + AFR_MSG_INODE_CTX_GET_FAILED, AFR_MSG_THIN_ARB, + AFR_MSG_THIN_ARB_XATTROP_FAILED, AFR_MSG_THIN_ARB_LOC_POP_FAILED, + AFR_MSG_GET_PEND_VAL, AFR_MSG_THIN_ARB_SKIP_SHD, 
AFR_MSG_UNKNOWN_SET, + AFR_MSG_NO_XL_ID, AFR_MSG_SELF_HEAL_INFO_START, + AFR_MSG_SELF_HEAL_INFO_FINISH, AFR_MSG_INCRE_COUNT, + AFR_MSG_ADD_TO_OUTPUT_FAILED, AFR_MSG_SET_TIME_FAILED, + AFR_MSG_GFID_MISMATCH_DETECTED, AFR_MSG_GFID_HEAL_MSG, + AFR_MSG_THIN_ARB_LOOKUP_FAILED, AFR_MSG_DICT_CREATE_FAILED, + AFR_MSG_NO_MAJORITY_TO_RESOLVE, AFR_MSG_TYPE_MISMATCH, + AFR_MSG_SIZE_POLICY_NOT_APPLICABLE, AFR_MSG_NO_CHILD_SELECTED, + AFR_MSG_INVALID_CHILD, AFR_MSG_RESOLVE_CONFLICTING_DATA, + SERROR_GETTING_SRC_BRICK, SNO_DIFF_IN_MTIME, SNO_BIGGER_FILE, + SALL_BRICKS_UP_TO_RESOLVE, AFR_MSG_UNLOCK_FAILED, AFR_MSG_POST_OP_FAILED, + AFR_MSG_TA_FRAME_CREATE_FAILED, AFR_MSG_SET_KEY_XATTROP_FAILED, + AFR_MSG_BLOCKING_ENTRYLKS_FAILED, AFR_MSG_FOP_FAILED, + AFR_MSG_CLEAN_UP_FAILED, AFR_MSG_UNABLE_TO_FETCH, AFR_MSG_XATTR_SET_FAILED, + AFR_MSG_SPLIT_BRAIN_REPLICA, AFR_MSG_INODE_CTX_FAILED, + AFR_MSG_LOOKUP_FAILED, AFR_MSG_ALL_SUBVOLS_DOWN, + AFR_MSG_RELEASE_LOCK_FAILED, AFR_MSG_CLEAR_TIME_SPLIT_BRAIN, + AFR_MSG_READ_FAILED, AFR_MSG_LAUNCH_FAILED, AFR_MSG_READ_SUBVOL_NOT_UP, + AFR_MSG_LK_HEAL_DOM, AFR_MSG_NEW_BRICK, AFR_MSG_SPLIT_BRAIN_SET_FAILED, + AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, AFR_MSG_HEALER_SPAWN_FAILED, + AFR_MSG_ADD_CRAWL_EVENT_FAILED, AFR_MSG_NULL_DEREF, AFR_MSG_SET_PEND_XATTR, + AFR_MSG_INTERNAL_ATTR); + +#define AFR_MSG_DICT_GET_FAILED_STR "Dict get failed" +#define AFR_MSG_DICT_SET_FAILED_STR "Dict set failed" +#define AFR_MSG_HEALER_SPAWN_FAILED_STR "Healer spawn failed" +#define AFR_MSG_ADD_CRAWL_EVENT_FAILED_STR "Adding crawl event failed" +#define AFR_MSG_INVALID_ARG_STR "Invalid argument" +#define AFR_MSG_INDEX_DIR_GET_FAILED_STR "unable to get index-dir on " +#define AFR_MSG_THIN_ARB_LOOKUP_FAILED_STR "Failed lookup on file" +#define AFR_MSG_DICT_CREATE_FAILED_STR "Failed to create dict." +#define AFR_MSG_THIN_ARB_XATTROP_FAILED_STR "Xattrop failed." 
+#define AFR_MSG_THIN_ARB_LOC_POP_FAILED_STR \ + "Failed to populate loc for thin-arbiter" +#define AFR_MSG_GET_PEND_VAL_STR "Error getting value of pending" +#define AFR_MSG_THIN_ARB_SKIP_SHD_STR "I am not the god shd. skipping." +#define AFR_MSG_UNKNOWN_SET_STR "Unknown set" +#define AFR_MSG_NO_XL_ID_STR "xl does not have id" +#define AFR_MSG_SELF_HEAL_INFO_START_STR "starting full sweep on" +#define AFR_MSG_SELF_HEAL_INFO_FINISH_STR "finished full sweep on" +#define AFR_MSG_INCRE_COUNT_STR "Could not increment the counter." +#define AFR_MSG_ADD_TO_OUTPUT_FAILED_STR "Could not add to output" +#define AFR_MSG_SET_TIME_FAILED_STR "Could not set time" +#define AFR_MSG_GFID_HEAL_MSG_STR "Error setting gfid-heal-msg dict" +#define AFR_MSG_NO_MAJORITY_TO_RESOLVE_STR \ + "No majority to resolve gfid split brain" +#define AFR_MSG_GFID_MISMATCH_DETECTED_STR "Gfid mismatch dectected" +#define AFR_MSG_SELF_HEAL_INFO_STR "performing selfheal" +#define AFR_MSG_TYPE_MISMATCH_STR "TYPE mismatch" +#define AFR_MSG_SIZE_POLICY_NOT_APPLICABLE_STR \ + "Size policy is not applicable to directories." 
+#define AFR_MSG_NO_CHILD_SELECTED_STR \ + "No child selected by favorite-child policy" +#define AFR_MSG_INVALID_CHILD_STR "Invalid child" +#define AFR_MSG_RESOLVE_CONFLICTING_DATA_STR \ + "selected as authentic to resolve conflicting data" +#define SERROR_GETTING_SRC_BRICK_STR "Error getting the source brick" +#define SNO_DIFF_IN_MTIME_STR "No difference in mtime" +#define SNO_BIGGER_FILE_STR "No bigger file" +#define SALL_BRICKS_UP_TO_RESOLVE_STR \ + "All the bricks should be up to resolve the gfid split brain" +#define AFR_MSG_UNLOCK_FAILED_STR "Failed to unlock" +#define AFR_MSG_POST_OP_FAILED_STR "Post-op on thin-arbiter failed" +#define AFR_MSG_TA_FRAME_CREATE_FAILED_STR "Failed to create ta_frame" +#define AFR_MSG_SET_KEY_XATTROP_FAILED_STR "Could not set key during xattrop" +#define AFR_MSG_BLOCKING_ENTRYLKS_FAILED_STR "Blocking entrylks failed" +#define AFR_MSG_FSYNC_FAILED_STR "fsync failed" +#define AFR_MSG_QUORUM_FAIL_STR "quorum is not met" +#define AFR_MSG_FOP_FAILED_STR "Failing Fop" +#define AFR_MSG_INVALID_SUBVOL_STR "not a subvolume" +#define AFR_MSG_VOL_MISCONFIGURED_STR "Volume is dangling" +#define AFR_MSG_CHILD_MISCONFIGURED_STR \ + "replicate translator needs more than one subvolume defined" +#define AFR_MSG_CLEAN_UP_FAILED_STR "Failed to clean up healer threads" +#define AFR_MSG_QUORUM_OVERRIDE_STR "overriding quorum-count" +#define AFR_MSG_UNABLE_TO_FETCH_STR \ + "Unable to fetch afr-pending-xattr option from volfile. Falling back to " \ + "using client translator names" +#define AFR_MSG_NULL_DEREF_STR "possible NULL deref" +#define AFR_MSG_XATTR_SET_FAILED_STR "Cannot set xattr cookie key" +#define AFR_MSG_SPLIT_BRAIN_STATUS_STR "Failed to create synctask" +#define AFR_MSG_SUBVOLS_DOWN_STR "All subvolumes are not up" +#define AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR_STR \ + "Failed to cancel split-brain choice" +#define AFR_MSG_SPLIT_BRAIN_REPLICA_STR \ + "Cannot set replica. 
File is not in data/metadata split-brain" +#define AFR_MSG_INODE_CTX_FAILED_STR "Failed to get inode_ctx" +#define AFR_MSG_READ_SUBVOL_ERROR_STR "no read subvols" +#define AFR_MSG_LOCAL_CHILD_STR "selecting local read-child" +#define AFR_MSG_LOOKUP_FAILED_STR "Failed to lookup/create thin-arbiter id file" +#define AFR_MSG_TIMER_CREATE_FAIL_STR \ + "Cannot create timer for delayed initialization" +#define AFR_MSG_SUBVOL_UP_STR "Subvolume came back up; going online" +#define AFR_MSG_ALL_SUBVOLS_DOWN_STR \ + "All subvolumes are down. Going offline until atleast one of them is up" +#define AFR_MSG_RELEASE_LOCK_FAILED_STR "Failed to release lock" +#define AFR_MSG_INVALID_CHILD_UP_STR "Received child_up from invalid subvolume" +#define AFR_MSG_QUORUM_MET_STR "Client-quorum is met" +#define AFR_MSG_EXPUNGING_FILE_OR_DIR_STR "expunging file or dir" +#define AFR_MSG_SELF_HEAL_FAILED_STR "Invalid" +#define AFR_MSG_SPLIT_BRAIN_STR "Skipping conservative mergeon the file" +#define AFR_MSG_CLEAR_TIME_SPLIT_BRAIN_STR "clear time split brain" +#define AFR_MSG_READ_FAILED_STR "Failing read since good brick is down" +#define AFR_MSG_LAUNCH_FAILED_STR "Failed to launch synctask" +#define AFR_MSG_READ_SUBVOL_NOT_UP_STR \ + "read subvolume in this generation is not up" +#define AFR_MSG_INTERNAL_LKS_FAILED_STR \ + "Unable to work with lk-owner while attempting fop" +#define AFR_MSG_LOCK_XLATOR_NOT_LOADED_STR \ + "subvolume does not support locking. please load features/locks xlator " \ + "on server." +#define AFR_MSG_FD_CTX_GET_FAILED_STR "unable to get fd ctx" +#define AFR_MSG_INFO_COMMON_STR "fd not open on any subvolumes, aborting." +#define AFR_MSG_REPLACE_BRICK_STATUS_STR "Couldn't acquire lock on any child." +#define AFR_MSG_NEW_BRICK_STR "New brick" +#define AFR_MSG_SPLIT_BRAIN_SET_FAILED_STR \ + "Failed to set split-brain choice to -1" +#define AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED_STR \ + "Failed to determine split-brain. 
Aborting split-brain-choice set" +#define AFR_MSG_OPEN_FAIL_STR "Failed to open subvolume" +#define AFR_MSG_SET_PEND_XATTR_STR "Set of pending xattr" +#define AFR_MSG_INTERNAL_ATTR_STR "is an internal extended attribute" +#endif /* !_AFR_MESSAGES_H_ */ diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 8c87ce89328..64856042b65 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -1,653 +1,353 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ -#include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" +#include <glusterfs/glusterfs.h> #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" -#include "statedump.h" - -#include "fd.h" - -#include "afr-inode-read.h" -#include "afr-inode-write.h" -#include "afr-dir-read.h" -#include "afr-dir-write.h" -#include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" +#include <glusterfs/dict.h> +#include <glusterfs/logging.h> +#include <glusterfs/defaults.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/statedump.h> +#include "afr-transaction.h" -int -afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +gf_boolean_t +afr_is_fd_fixable(fd_t *fd) { - afr_local_t * local = frame->local; - - AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, - local->fd); - return 0; + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous(fd)) + return _gf_false; + else if (gf_uuid_is_null(fd->inode->gfid)) + return _gf_false; + + return _gf_true; } - int -afr_open_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd) +afr_open_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - - int child_index = (long) cookie; - - uint64_t ctx; - 
afr_fd_ctx_t *fd_ctx; - - int ret = 0; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - local->success_count++; - - ret = afr_fd_ctx_set (this, fd); - - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set fd ctx for fd=%p", - fd); - - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - - ret = fd_ctx_get (fd, this, &ctx); - - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not get fd ctx for fd=%p", fd); - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->opened_on[child_index] = 1; - fd_ctx->flags = local->cont.open.flags; - fd_ctx->wbflags = local->cont.open.wbflags; - } - } -unlock: - UNLOCK (&frame->lock); + afr_local_t *local = frame->local; - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if ((local->cont.open.flags & O_TRUNC) - && (local->op_ret >= 0)) { - STACK_WIND (frame, afr_open_ftruncate_cbk, - this, this->fops->ftruncate, - fd, 0); - } else { - AFR_STACK_UNWIND (open, frame, local->op_ret, - local->op_errno, local->fd); - } - } - - return 0; + AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, + local->cont.open.fd, xdata); + return 0; } - int -afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags) +afr_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - int i = 0; - int ret = -1; - - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t wind_flags = flags & (~O_TRUNC); - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - - if (afr_is_split_brain (this, 
loc->inode)) { - /* self-heal failed */ - op_errno = EIO; - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - frame->local = local; - call_count = local->call_count; - - loc_copy (&local->loc, loc); - - local->cont.open.flags = flags; - local->cont.open.wbflags = wbflags; - - local->fd = fd_ref (fd); - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->open, - loc, wind_flags, fd, wbflags); - - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (open, frame, op_ret, op_errno, fd); - } - - return 0; -} - - -int -afr_openfd_sh_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int ret = 0; - - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; - - int call_count = 0; - int child_index = (long) cookie; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - ret = fd_ctx_get (fd, this, &ctx); - - if (ret < 0) { - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->opened_on[child_index] = 1; - - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened successfully on subvolume %s", - local->loc.path, priv->children[child_index]->name); - } + afr_local_t *local = NULL; + int call_count = -1; + int child_index = (long)cookie; + afr_fd_ctx_t *fd_ctx = NULL; + + local = frame->local; + fd_ctx = local->fd_ctx; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + + LOCK(&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + 
fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { + local->op_ret = op_ret; + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); } -out: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - int_lock->lock_cbk = local->transaction.done; - local->transaction.resume (frame, this); + call_count = --local->call_count; + } + UNLOCK(&frame->lock); + + if (call_count == 0) { + afr_handle_replies_quorum(frame, this); + if (local->op_ret == -1) { + AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, NULL, + NULL); + } else if (fd_ctx->flags & O_TRUNC) { + STACK_WIND(frame, afr_open_ftruncate_cbk, this, + this->fops->ftruncate, fd, 0, NULL); + } else { + AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, + local->cont.open.fd, local->xdata_rsp); } + } - return 0; + return 0; } - -static int -__unopened_count (int child_count, unsigned int *opened_on, unsigned char *child_up) -{ - int i; - int count = 0; - - for (i = 0; i < child_count; i++) { - if (!opened_on[i] && child_up[i]) - count++; - } - - return count; -} - - int -afr_openfd_sh_unwind (call_frame_t *frame, xlator_t *this) +afr_open_continue(call_frame_t *frame, xlator_t *this, int err) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; - - int abandon = 0; - int ret = 0; - int i; - int call_count = 0; - - priv = this->private; - local = frame->local; - - /* - * Some subvolumes might have come up on which we never - * opened this fd in the first place. Re-open fd's on those - * subvolumes now. 
- */ - - ret = fd_ctx_get (local->fd, this, &ctx); - - if (ret < 0) { - abandon = 1; - goto out; - } + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - LOCK (&local->fd->lock); - { - call_count = __unopened_count (priv->child_count, - fd_ctx->opened_on, - local->child_up); - for (i = 0; i < priv->child_count; i++) { - fd_ctx->pre_op_done[i] = 0; - fd_ctx->pre_op_piggyback[i] = 0; - } - } - UNLOCK (&local->fd->lock); + local = frame->local; + priv = this->private; - if (call_count == 0) { - abandon = 1; - goto out; - } - - local->call_count = call_count; + if (err) { + AFR_STACK_UNWIND(open, frame, -1, err, NULL, NULL); + } else { + local->call_count = AFR_COUNT(local->child_up, priv->child_count); + call_count = local->call_count; for (i = 0; i < priv->child_count; i++) { - if (!fd_ctx->opened_on[i] && local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "opening fd for %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_openfd_sh_open_cbk, - (void *)(long) i, - priv->children[i], - priv->children[i]->fops->open, - &local->loc, fd_ctx->flags, local->fd, - fd_ctx->wbflags); - - if (!--call_count) - break; - } - } - -out: - if (abandon) - local->transaction.resume (frame, this); - - return 0; -} - - -static int -afr_prepare_loc (call_frame_t *frame, fd_t *fd) -{ - afr_local_t *local = NULL; - char *name = NULL; - char *path = NULL; - int ret = 0; - - if ((!fd) || (!fd->inode)) - return -1; - - local = frame->local; - ret = inode_path (fd->inode, NULL, (char **)&path); - if (ret <= 0) { - gf_log (frame->this->name, GF_LOG_DEBUG, - "Unable to get path for gfid: %s", - uuid_utoa (fd->inode->gfid)); - return -1; - } - - if (local->loc.path) { - if (strcmp (path, local->loc.path)) - gf_log (frame->this->name, GF_LOG_DEBUG, - "overwriting old loc->path %s with %s", - local->loc.path, path); - GF_FREE ((char *)local->loc.path); - } - 
local->loc.path = path; - - name = strrchr (local->loc.path, '/'); - if (name) - name++; - local->loc.name = name; - - if (local->loc.inode) { - inode_unref (local->loc.inode); - } - local->loc.inode = inode_ref (fd->inode); - - if (local->loc.parent) { - inode_unref (local->loc.parent); + if (local->child_up[i]) { + STACK_WIND_COOKIE(frame, afr_open_cbk, (void *)(long)i, + priv->children[i], + priv->children[i]->fops->open, &local->loc, + (local->cont.open.flags & ~O_TRUNC), + local->cont.open.fd, local->xdata_req); + if (!--call_count) + break; + } } - - local->loc.parent = inode_parent (local->loc.inode, 0, NULL); - - return 0; -} - - -int -afr_openfd_sh (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - char sh_type_str[256] = {0,}; - - local = frame->local; - sh = &local->self_heal; - - GF_ASSERT (local->loc.path); - /* forcibly trigger missing-entries self-heal */ - - local->success_count = 1; - local->enoent_count = 1; - - sh->data_lock_held = _gf_true; - sh->need_data_self_heal = _gf_true; - sh->type = local->fd->inode->ia_type; - sh->background = _gf_false; - sh->unwind = afr_openfd_sh_unwind; - - afr_self_heal_type_str_get(&local->self_heal, - sh_type_str, - sizeof(sh_type_str)); - gf_log (this->name, GF_LOG_NORMAL, "%s self-heal triggered. 
" - "path: %s, reason: Replicate up down flush, data lock is held", - sh_type_str, local->loc.path); - - afr_self_heal (frame, this); - - return 0; + } + return 0; } - int -afr_openfd_flush_done (call_frame_t *frame, xlator_t *this) +afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; - - int _ret = -1; - - priv = this->private; - local = frame->local; - - LOCK (&local->fd->lock); - { - _ret = __fd_ctx_get (local->fd, this, &ctx); - - if (_ret < 0) { - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->down_count = priv->down_count; - fd_ctx->up_count = priv->up_count; - } + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int spb_subvol = 0; + int event_generation = 0; + int ret = 0; + int32_t op_errno = 0; + afr_fd_ctx_t *fd_ctx = NULL; + + // We can't let truncation to happen outside transaction. + + priv = this->private; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_OPEN; + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) { + op_errno = ENOMEM; + goto out; + } + + if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) { + op_errno = afr_quorum_errno(priv); + goto out; + } + + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + + local->inode = inode_ref(loc->inode); + loc_copy(&local->loc, loc); + local->fd_ctx = fd_ctx; + fd_ctx->flags = flags; + if (xdata) + local->xdata_req = dict_ref(xdata); + + local->cont.open.flags = flags; + local->cont.open.fd = fd_ref(fd); + + ret = afr_inode_get_readable(frame, local->inode, this, NULL, + &event_generation, AFR_DATA_TRANSACTION); + if ((ret < 0) && + (afr_split_brain_read_subvol_get(local->inode, this, NULL, + &spb_subvol) == 0) && + spb_subvol < 0) { + afr_inode_refresh(frame, this, local->inode, local->inode->gfid, + afr_open_continue); + } else { 
+ afr_open_continue(frame, this, 0); + } + + return 0; out: - UNLOCK (&local->fd->lock); - - afr_local_transaction_cleanup (local, this); + AFR_STACK_UNWIND(open, frame, -1, op_errno, fd, NULL); - gf_log (this->name, GF_LOG_TRACE, - "The up/down flush is over"); - - fd_unref (local->fd); - local->openfd_flush_cbk (frame, this); - - return 0; + return 0; } - - int -afr_openfd_xaction (call_frame_t *frame, xlator_t *this, fd_t *fd) +afr_openfd_fix_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) { - afr_local_t * local = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - local = frame->local; - - local->op = GF_FOP_FLUSH; - - local->transaction.fop = afr_openfd_sh; - local->transaction.done = afr_openfd_flush_done; - - local->transaction.start = 0; - local->transaction.len = 0; - - gf_log (this->name, GF_LOG_TRACE, - "doing up/down flush on fd=%p", - fd); + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int call_count = 0; + int child_index = (long)cookie; + + priv = this->private; + local = frame->local; + + if (op_ret >= 0) { + gf_msg_debug(this->name, 0, + "fd for %s opened " + "successfully on subvolume %s", + local->loc.path, priv->children[child_index]->name); + } else { + gf_smsg(this->name, fop_log_level(GF_FOP_OPEN, op_errno), op_errno, + AFR_MSG_OPEN_FAIL, "path=%s", local->loc.path, "subvolume=%s", + priv->children[child_index]->name, NULL); + } + + fd_ctx = local->fd_ctx; + + LOCK(&local->fd->lock); + { + if (op_ret >= 0) { + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + } else { + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } + } + UNLOCK(&local->fd->lock); - afr_transaction (frame, this, AFR_DATA_TRANSACTION); + call_count = afr_frame_return(frame); + if (call_count == 0) + AFR_STACK_DESTROY(frame); -out: - return 0; + return 0; } - - -int 
-afr_openfd_xaction_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) +static int +afr_fd_ctx_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + afr_private_t *priv = NULL; + int i = 0; + int count = 0; - int ret = 0; - - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - int call_count = 0; - int child_index = (long) cookie; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - ret = fd_ctx_get (fd, this, &ctx); - - if (ret < 0) { - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->opened_on[child_index] = 1; - - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened successfully on subvolume %s", - local->loc.path, priv->children[child_index]->name); - } - } -out: - UNLOCK (&frame->lock); + priv = this->private; - call_count = afr_frame_return (frame); + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) + return 0; - if (call_count == 0) { - afr_openfd_xaction (frame, this, local->fd); + LOCK(&fd->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED && + priv->child_up[i]) { + fd_ctx->opened_on[i] = AFR_FD_OPENING; + need_open[i] = 1; + count++; + } else { + need_open[i] = 0; + } } + } + UNLOCK(&fd->lock); - return 0; + return count; } - -int -afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +void +afr_fix_open(fd_t *fd, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; - - int no_open = 0; - int ret = 0; - int i; - int call_count = 0; - - priv = this->private; - local = frame->local; - - /* - * If the file is already deleted while the fd is open, no need to - * perform the openfd flush, call the flush_cbk and get out. 
- */ - ret = afr_prepare_loc (frame, fd); - if (ret < 0) { - local->openfd_flush_cbk (frame, this); - goto out; - } - - /* - * Some subvolumes might have come up on which we never - * opened this fd in the first place. Re-open fd's on those - * subvolumes now. - */ - - local->fd = fd_ref (fd); - - ret = fd_ctx_get (fd, this, &ctx); - - if (ret < 0) { - no_open = 1; - goto out; + afr_private_t *priv = NULL; + int i = 0; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int32_t op_errno = 0; + afr_fd_ctx_t *fd_ctx = NULL; + unsigned char *need_open = NULL; + int call_count = 0; + + priv = this->private; + + if (!afr_is_fd_fixable(fd)) + goto out; + + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) + goto out; + + need_open = alloca0(priv->child_count); + + call_count = afr_fd_ctx_need_open(fd, this, need_open); + if (!call_count) + goto out; + + frame = create_frame(this, this->ctx->pool); + if (!frame) + goto out; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->loc.inode = inode_ref(fd->inode); + ret = loc_path(&local->loc, NULL); + if (ret < 0) + goto out; + + local->fd = fd_ref(fd); + local->fd_ctx = fd_ctx; + + local->call_count = call_count; + + gf_msg_debug(this->name, 0, "need open count: %d", call_count); + + for (i = 0; i < priv->child_count; i++) { + if (!need_open[i]) + continue; + + if (IA_IFDIR == fd->inode->ia_type) { + gf_msg_debug(this->name, 0, "opening fd for dir %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i, + priv->children[i], + priv->children[i]->fops->opendir, &local->loc, + local->fd, NULL); + } else { + gf_msg_debug(this->name, 0, + "opening fd for file %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->open, + &local->loc, fd_ctx->flags & (~O_TRUNC), + 
local->fd, NULL); } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - LOCK (&local->fd->lock); - { - call_count = __unopened_count (priv->child_count, - fd_ctx->opened_on, - local->child_up); - } - UNLOCK (&local->fd->lock); - - if (call_count == 0) { - no_open = 1; - goto out; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (!fd_ctx->opened_on[i] && local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "opening fd for %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_openfd_xaction_open_cbk, - (void *)(long) i, - priv->children[i], - priv->children[i]->fops->open, - &local->loc, fd_ctx->flags, fd, - fd_ctx->wbflags); - - if (!--call_count) - break; - } - } + if (!--call_count) + break; + } + return; out: - if (no_open) - afr_openfd_xaction (frame, this, fd); - - return 0; + if (frame) + AFR_STACK_DESTROY(frame); } diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c new file mode 100644 index 00000000000..6fc2c75145c --- /dev/null +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -0,0 +1,494 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/
+
+#include "afr.h"
+#include "afr-transaction.h"
+#include "afr-messages.h"
+
+/* Atomically bump the pending-read counter of the child at @child_index.
+
+   Indices outside 0 .. child_count - 1 are ignored. That includes -1,
+   which afr_read_txn_wind() passes when no subvolume was selected.
+   NOTE(review): assumes pending_reads[] has child_count entries, like the
+   other per-child arrays walked with i < child_count in this file, so
+   child_count itself is out of range - confirm against its allocation.
+*/
+void
+afr_pending_read_increment(afr_private_t *priv, int child_index)
+{
+    if (child_index < 0 || child_index >= priv->child_count)
+        return;
+
+    GF_ATOMIC_INC(priv->pending_reads[child_index]);
+}
+
+/* Counterpart of afr_pending_read_increment(): atomically drop the
+   pending-read counter of the child at @child_index. Out-of-range
+   indices (see above) are ignored.
+*/
+void
+afr_pending_read_decrement(afr_private_t *priv, int child_index)
+{
+    if (child_index < 0 || child_index >= priv->child_count)
+        return;
+
+    GF_ATOMIC_DEC(priv->pending_reads[child_index]);
+}
+
+/* Hand the read over to @subvol.
+
+   The pending-read accounting is moved from the subvolume previously
+   recorded in local->read_subvol to @subvol, @subvol is recorded, and
+   local->readfn is invoked with it. @subvol may be -1; callers in this
+   file use that to report that no subvolume could be chosen.
+*/
+void
+afr_read_txn_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    afr_pending_read_decrement(priv, local->read_subvol);
+    local->read_subvol = subvol;
+    afr_pending_read_increment(priv, subvol);
+    local->readfn(frame, this, subvol);
+}
+
+/* Wind the read to the lowest-indexed readable subvolume that has not
+   been attempted yet, marking it attempted. Unreadable subvolumes are
+   marked attempted on the way. Winds with -1 when nothing is left.
+   Always returns 0.
+*/
+int
+afr_read_txn_next_subvol(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int i = 0;
+    int subvol = -1;
+
+    local = frame->local;
+    priv = this->private;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->readable[i]) {
+            /* don't even bother trying here.
+               just mark as attempted and move on. */
+            local->read_attempted[i] = 1;
+            continue;
+        }
+
+        if (!local->read_attempted[i]) {
+            subvol = i;
+            break;
+        }
+    }
+
+    /* If no more subvols were available for reading, we leave
+       @subvol as -1, which is an indication we have run out of
+       readable subvols. */
+    if (subvol != -1)
+        local->read_attempted[subvol] = 1;
+    afr_read_txn_wind(frame, this, subvol);
+
+    return 0;
+}
+
+/* Completion callback given to synctask_new() for afr_ta_read_txn():
+   destroys the stack of the helper frame. @ret and @opaque are unused.
+*/
+static int
+afr_ta_read_txn_done(int ret, call_frame_t *ta_frame, void *opaque)
+{
+    STACK_DESTROY(ta_frame->root);
+    return 0;
+}
+
+/* Synctask body that picks the read subvolume when thin-arbiter is in use.
+
+   @opaque is the read frame. local->read_txn_query_child names the data
+   brick to interrogate; the other data brick is the "possibly bad" one.
+
+   - If the queried brick's xattrop reply blames the other brick, read
+     from the queried brick.
+   - Otherwise take the AFR_TA_DOM_MODIFY inodelk on the thin-arbiter and
+     xattrop it: read from the queried brick unless the thin-arbiter
+     blames it, in which case the read fails with EIO.
+
+   The read is always wound through afr_read_txn_wind(); a subvolume of
+   -1 together with op_ret/op_errno set in @local reports failure.
+   Returns the result of the last syncop/helper call made.
+*/
+static int
+afr_ta_read_txn(void *opaque)
+{
+    call_frame_t *frame = NULL;
+    xlator_t *this = NULL;
+    int read_subvol = -1;
+    int query_child = AFR_CHILD_UNKNOWN;
+    int possible_bad_child = AFR_CHILD_UNKNOWN;
+    int ret = 0;
+    int op_errno = ENOMEM;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    struct gf_flock flock = {
+        0,
+    };
+    dict_t *xdata_req = NULL;
+    dict_t *xdata_rsp = NULL;
+    int **pending = NULL;
+    loc_t loc = {
+        0,
+    };
+
+    frame = (call_frame_t *)opaque;
+    this = frame->this;
+    local = frame->local;
+    priv = this->private;
+    query_child = local->read_txn_query_child;
+
+    if (query_child == AFR_CHILD_ZERO) {
+        possible_bad_child = AFR_CHILD_ONE;
+    } else if (query_child == AFR_CHILD_ONE) {
+        possible_bad_child = AFR_CHILD_ZERO;
+    } else {
+        /*read_txn_query_child is AFR_CHILD_UNKNOWN*/
+        goto out;
+    }
+
+    /* Ask the query_child to see if it blames the possibly bad one. */
+    xdata_req = dict_new();
+    if (!xdata_req)
+        goto out;
+
+    pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+    if (!pending)
+        goto out;
+
+    ret = afr_set_pending_dict(priv, xdata_req, pending);
+    if (ret < 0)
+        goto out;
+
+    if (local->fd) {
+        ret = syncop_fxattrop(priv->children[query_child], local->fd,
+                              GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
+                              NULL);
+    } else {
+        ret = syncop_xattrop(priv->children[query_child], &local->loc,
+                             GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
+                             NULL);
+    }
+    if (ret || !xdata_rsp) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed xattrop for gfid %s on %s",
+               uuid_utoa(local->inode->gfid),
+               priv->children[query_child]->name);
+        op_errno = -ret;
+        goto out;
+    }
+
+    if (afr_ta_dict_contains_pending_xattr(xdata_rsp, priv,
+                                           possible_bad_child)) {
+        read_subvol = query_child;
+        goto out;
+    }
+    dict_unref(xdata_rsp);
+    xdata_rsp = NULL;
+
+    /* It doesn't. So query thin-arbiter to see if it blames any data brick. */
+    ret = afr_fill_ta_loc(this, &loc, _gf_true);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed to populate thin-arbiter loc for: %s.", loc.name);
+        goto out;
+    }
+    flock.l_type = F_WRLCK; /*start and length are already zero. */
+    ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+                         AFR_TA_DOM_MODIFY, &loc, F_SETLKW, &flock, NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "gfid:%s: Failed to get AFR_TA_DOM_MODIFY lock on %s.",
+               uuid_utoa(local->inode->gfid),
+               priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+        op_errno = -ret;
+        goto out;
+    }
+
+    ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
+                         GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
+                         NULL);
+    if (ret || !xdata_rsp) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "gfid:%s: Failed xattrop on %s.", uuid_utoa(local->inode->gfid),
+               priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+        op_errno = -ret;
+        goto unlock;
+    }
+
+    if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, query_child)) {
+        read_subvol = query_child;
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB,
+               "Failing read for gfid %s since good brick %s is down",
+               uuid_utoa(local->inode->gfid),
+               priv->children[possible_bad_child]->name);
+        op_errno = EIO;
+    }
+
+unlock:
+    flock.l_type = F_UNLCK;
+    ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+                         AFR_TA_DOM_MODIFY, &loc, F_SETLK, &flock, NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "gfid:%s: Failed to unlock AFR_TA_DOM_MODIFY lock on "
+               "%s.",
+               uuid_utoa(local->inode->gfid),
+               priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+    }
+out:
+    if (xdata_req)
+        dict_unref(xdata_req);
+    if (xdata_rsp)
+        dict_unref(xdata_rsp);
+    if (pending)
+        afr_matrix_cleanup(pending, priv->child_count);
+    loc_wipe(&loc);
+
+    if (read_subvol == -1) {
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+    }
+    afr_read_txn_wind(frame, this, read_subvol);
+    return ret;
+}
+
+/* Run afr_ta_read_txn() for the read @frame in a synctask, using a helper
+   frame from afr_ta_frame_create(). If the helper frame or the synctask
+   cannot be created, op_ret/op_errno are set to -1/ENOMEM in @local and
+   the read is wound with subvolume -1.
+*/
+void
+afr_ta_read_txn_synctask(call_frame_t *frame, xlator_t *this)
+{
+    call_frame_t *ta_frame = NULL;
+    afr_local_t *local = NULL;
+    int ret = 0;
+
+    local = frame->local;
+    ta_frame = afr_ta_frame_create(this);
+    if (!ta_frame) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+               "Failed to create ta_frame");
+        goto out;
+    }
+    ret = synctask_new(this->ctx->env, afr_ta_read_txn, afr_ta_read_txn_done,
+                       ta_frame, frame);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+               "Failed to launch "
+               "afr_ta_read_txn synctask for gfid %s.",
+               uuid_utoa(local->inode->gfid));
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        STACK_DESTROY(ta_frame->root);
+        goto out;
+    }
+    return;
+out:
+    afr_read_txn_wind(frame, this, -1);
+}
+
+/* Callback handed to afr_inode_refresh() by the read transaction.
+
+   @err is the refresh result. With thin-arbiter configured and
+   @err == EINVAL the decision is delegated to afr_ta_read_txn_synctask().
+   Otherwise a read subvolume is selected by policy; if that subvolume was
+   already attempted, the next untried one is used. When no subvolume can
+   be selected, the split-brain read subvolume is tried before the error
+   is recorded and the read is wound with -1. Always returns 0.
+*/
+int
+afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    int read_subvol = -1;
+    inode_t *inode = NULL;
+    int ret = -1;
+    int spb_subvol = -1;
+
+    local = frame->local;
+    inode = local->inode;
+    priv = this->private;
+
+    if (err) {
+        if (!priv->thin_arbiter_count)
+            goto readfn;
+        if (err != EINVAL)
+            goto readfn;
+        /* We need to query the good bricks and/or thin-arbiter.*/
+        afr_ta_read_txn_synctask(frame, this);
+        return 0;
+    }
+
+    read_subvol = afr_read_subvol_select_by_policy(inode, this, local->readable,
+                                                   NULL);
+    if (read_subvol == -1) {
+        err = EIO;
+        goto readfn;
+    }
+
+    if (local->read_attempted[read_subvol]) {
+        afr_read_txn_next_subvol(frame, this);
+        return 0;
+    }
+
+    local->read_attempted[read_subvol] = 1;
+readfn:
+    if (read_subvol == -1) {
+        ret = afr_split_brain_read_subvol_get(inode, this, frame, &spb_subvol);
+        if ((ret == 0) && spb_subvol >= 0)
+            read_subvol = spb_subvol;
+    }
+
+    if (read_subvol == -1) {
+        AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(-1, err);
+    }
+    afr_read_txn_wind(frame, this, read_subvol);
+
+    return 0;
+}
+
+/* Continue a read transaction after an attempt: refresh the inode once
+   per transaction (tracked in local->refreshed), afterwards move on to
+   the next untried subvolume. @subvol is not used. Always returns 0.
+*/
+int
+afr_read_txn_continue(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_local_t *local = NULL;
+
+    local = frame->local;
+
+    if (!local->refreshed) {
+        local->refreshed = _gf_true;
+        afr_inode_refresh(frame, this, local->inode, NULL,
+                          afr_read_txn_refresh_done);
+    } else {
+        afr_read_txn_next_subvol(frame, this);
+    }
+
+    return 0;
+}
+
+/* afr_read_txn_wipe:
+
+   clean internal variables in @local in order to make
+   it possible to call afr_read_txn() multiple times from
+   the same frame
+*/
+
+void
+afr_read_txn_wipe(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int i = 0;
+
+    local = frame->local;
+    priv = this->private;
+
+    local->readfn = NULL;
+
+    if (local->inode) {
+        inode_unref(local->inode);
+        /* the reference is gone; do not leave a stale pointer that a
+           later unref could hit again */
+        local->inode = NULL;
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        local->read_attempted[i] = 0;
+        local->readable[i] = 0;
+    }
+}
+
+/*
+   afr_read_txn:
+
+   This is the read transaction function. The way it works:
+
+   - Determine read-subvolume from inode ctx.
+
+   - If read-subvolume's generation was stale, refresh ctx once by
+     calling afr_inode_refresh()
+
+     Else make an attempt to read on read-subvolume.
+
+   - If attempted read on read-subvolume fails, refresh ctx once
+     by calling afr_inode_refresh()
+
+   - After ctx refresh, query read-subvolume freshly and attempt
+     read once.
+
+   - If read fails, try every other readable[] subvolume before
+     finally giving up. readable[] elements are set by afr_inode_refresh()
+     based on dirty and pending flags.
+
+   - If file is in split brain in the backend, generation will be
+     kept 0 by afr_inode_refresh() and readable[] will be set 0 for
+     all elements. Therefore reads always fail.
+*/
+
+/* Parameters:
+     @frame  - read frame; its local carries the transaction state
+     @inode  - inode being read; a reference is taken into local->inode
+     @readfn - stored in local->readfn and invoked by afr_read_txn_wind()
+               with the chosen subvolume, or -1 on failure
+     @type   - stored in local->transaction.type
+
+   Always returns 0; failures are reported by winding with subvolume -1
+   after setting local->op_ret / local->op_errno.
+*/
+int
+afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode,
+             afr_read_txn_wind_t readfn, afr_transaction_type type)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    unsigned char *data = NULL;
+    unsigned char *metadata = NULL;
+    int read_subvol = -1;
+    int event_generation = 0;
+    int ret = -1;
+
+    priv = this->private;
+    local = frame->local;
+    data = alloca0(priv->child_count);
+    metadata = alloca0(priv->child_count);
+
+    afr_read_txn_wipe(frame, this);
+
+    local->readfn = readfn;
+    local->inode = inode_ref(inode);
+    local->is_read_txn = _gf_true;
+    local->transaction.type = type;
+
+    if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+        local->op_ret = -1;
+        local->op_errno = afr_quorum_errno(priv);
+        goto read;
+    }
+
+    if (!afr_is_consistent_io_possible(local, priv, &local->op_errno)) {
+        local->op_ret = -1;
+        goto read;
+    }
+
+    if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) {
+        local->op_ret = -1;
+        /* stored un-negated, same as the quorum failure above */
+        local->op_errno = afr_quorum_errno(priv);
+        goto read;
+    }
+
+    if (priv->thin_arbiter_count &&
+        AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
+        if (local->child_up[0]) {
+            local->read_txn_query_child = AFR_CHILD_ZERO;
+        } else if (local->child_up[1]) {
+            local->read_txn_query_child = AFR_CHILD_ONE;
+        }
+        afr_ta_read_txn_synctask(frame, this);
+        return 0;
+    }
+
+    ret = afr_inode_read_subvol_get(inode, this, data, metadata,
+                                    &event_generation);
+    if (ret == -1)
+        /* very first transaction on this inode */
+        goto refresh;
+    AFR_INTERSECT(local->readable, data, metadata, priv->child_count);
+
+    gf_msg_debug(this->name, 0,
+                 "%s: generation now vs cached: %d, "
+                 "%d",
+                 uuid_utoa(inode->gfid), local->event_generation,
+                 event_generation);
+    if (afr_is_inode_refresh_reqd(inode, this, local->event_generation,
+                                  event_generation))
+        /* servers have disconnected / reconnected, and possibly
+           rebooted, very likely changing the state of freshness
+           of copies */
+        goto refresh;
+
+    read_subvol = afr_read_subvol_select_by_policy(inode, this, local->readable,
+                                                   NULL);
+
+    /* valid subvolume indices are 0 .. child_count - 1; the value is used
+       below to index child_up[] and read_attempted[] */
+    if (read_subvol < 0 || read_subvol >= priv->child_count) {
+        gf_msg_debug(this->name, 0,
+                     "Unreadable subvolume %d found "
+                     "with event generation %d for gfid %s.",
+                     read_subvol, event_generation, uuid_utoa(inode->gfid));
+        goto refresh;
+    }
+
+    if (!local->child_up[read_subvol]) {
+        /* should never happen, just in case */
+        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_READ_SUBVOL_ERROR,
+               "subvolume %d is the "
+               "read subvolume in this generation, but is not up",
+               read_subvol);
+        goto refresh;
+    }
+
+    local->read_attempted[read_subvol] = 1;
+
+read:
+    afr_read_txn_wind(frame, this, read_subvol);
+
+    return 0;
+
+refresh:
+    afr_inode_refresh(frame, this, inode, NULL, afr_read_txn_refresh_done);
+
+    return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
deleted file mode 100644
index f72da774127..00000000000
--- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c
+++ /dev/null
@@ -1,1090 +0,0 @@
-/*
- Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/ - - -#include "glusterfs.h" -#include "afr.h" -#include "xlator.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" -#include "md5.h" - -#include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "afr-self-heal-algorithm.h" - -/* - This file contains the various self-heal algorithms -*/ - - -/* - The "full" algorithm. Copies the entire file from - source to sinks. -*/ - - -static void -sh_full_private_cleanup (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh_priv = sh->private; - - if (sh_priv) - GF_FREE (sh_priv); -} - - -static int -sh_full_loop_driver (call_frame_t *frame, xlator_t *this, gf_boolean_t is_first_call); - -static int -sh_full_loop_driver_done (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - - sh_full_private_cleanup (frame, this); - if (sh->op_failed) { - gf_log (this->name, GF_LOG_TRACE, - "full self-heal aborting on %s", - local->loc.path); - - local->self_heal.algo_abort_cbk (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "full self-heal completed on %s", - local->loc.path); - - local->self_heal.algo_completion_cbk (frame, this); - } - return 0; -} - -static int -sh_full_loop_return (call_frame_t *rw_frame, xlator_t *this, off_t offset) -{ - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - 
afr_self_heal_t *sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; - - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = sh->private; - - AFR_STACK_DESTROY (rw_frame); - - sh_full_loop_driver (sh_frame, this, _gf_false); - - return 0; -} - - -static int -sh_full_write_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) -{ - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t *rw_sh = NULL; - - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - - int child_index = (long) cookie; - int call_count = 0; - - priv = this->private; - - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - gf_log (this->name, GF_LOG_TRACE, - "wrote %d bytes of data from %s to child %d, offset %"PRId64"", - op_ret, sh_local->loc.path, child_index, - rw_sh->offset - op_ret); - - LOCK (&sh_frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "write to %s failed on subvolume %s (%s)", - sh_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->op_failed = 1; - } - } - UNLOCK (&sh_frame->lock); - - call_count = afr_frame_return (rw_frame); - - if (call_count == 0) { - sh_full_loop_return (rw_frame, this, rw_sh->offset - op_ret); - } - - return 0; -} - - -static int -sh_full_read_cbk (call_frame_t *rw_frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref) -{ - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t *rw_sh = NULL; - - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - - int i = 0; - 
int call_count = 0; - - off_t offset = (long) cookie; - - priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - call_count = sh->active_sinks; - - rw_local->call_count = call_count; - - gf_log (this->name, GF_LOG_TRACE, - "read %d bytes of data from %s, offset %"PRId64"", - op_ret, sh_local->loc.path, offset); - - if (op_ret <= 0) { - sh->op_failed = 1; - sh_full_loop_return (rw_frame, this, offset); - return 0; - } - - rw_sh->offset += op_ret; - - if (sh->file_has_holes) { - if (iov_0filled (vector, count) == 0) { - /* the iter function depends on the - sh->offset already being updated - above - */ - - sh_full_loop_return (rw_frame, this, offset); - goto out; - } - } - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !sh_local->child_up[i]) - continue; - - /* this is a sink, so write to it */ - - STACK_WIND_COOKIE (rw_frame, sh_full_write_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->writev, - sh->healing_fd, vector, count, offset, - iobref); - - if (!--call_count) - break; - } - -out: - return 0; -} - - -static int -sh_full_read_write (call_frame_t *frame, xlator_t *this, off_t offset) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t *rw_sh = NULL; - afr_self_heal_t *sh = NULL; - - call_frame_t *rw_frame = NULL; - - int32_t op_errno = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - rw_frame = copy_frame (frame); - if (!rw_frame) - goto out; - - ALLOC_OR_GOTO (rw_local, afr_local_t, out); - - rw_frame->local = rw_local; - rw_sh = &rw_local->self_heal; - - rw_sh->offset = offset; - rw_sh->sh_frame = frame; - - STACK_WIND_COOKIE (rw_frame, sh_full_read_cbk, - (void *) (long) offset, - priv->children[sh->source], - priv->children[sh->source]->fops->readv, - sh->healing_fd, sh->block_size, - offset); 
- return 0; - -out: - sh->op_failed = 1; - - sh_full_loop_driver (frame, this, _gf_false); - - return 0; -} - - -static int -sh_full_loop_driver (call_frame_t *frame, xlator_t *this, gf_boolean_t is_first_call) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; - gf_boolean_t is_driver_done = _gf_false; - blksize_t block_size = 0; - off_t offset = 0; - - int loop = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - - LOCK (&sh_priv->lock); - { - if (_gf_false == is_first_call) - sh_priv->loops_running--; - offset = sh_priv->offset; - block_size = sh->block_size; - while ((sh->op_failed == 0) && - (sh_priv->loops_running < priv->data_self_heal_window_size) - && (sh_priv->offset < sh->file_size)) { - - loop++; - gf_log (this->name, GF_LOG_TRACE, - "spawning a loop for offset %"PRId64, - sh_priv->offset); - - sh_priv->offset += sh->block_size; - sh_priv->loops_running++; - - if (_gf_false == is_first_call) - break; - - } - if (0 == sh_priv->loops_running) { - is_driver_done = _gf_true; - } - } - UNLOCK (&sh_priv->lock); - - while (loop--) { - if (sh->op_failed) { - // op failed in other loop, stop spawning more loops - sh_full_loop_driver (frame, this, _gf_false); - } else { - sh_full_read_write (frame, this, offset); - offset += block_size; - } - } - - if (is_driver_done) { - sh_full_loop_driver_done (frame, this); - } - - return 0; -} - - -int -afr_sh_algo_full (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh_priv = GF_CALLOC (1, sizeof (*sh_priv), - gf_afr_mt_afr_private_t); - - LOCK_INIT (&sh_priv->lock); - - sh->private = sh_priv; - - local->call_count = 0; - - sh_full_loop_driver (frame, this, _gf_true); - return 0; -} - - -/* - * The "diff" algorithm. 
Copies only those blocks whose checksums - * don't match with those of source. - */ - - -static void -sh_diff_private_cleanup (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - - int i; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh_priv = sh->private; - - for (i = 0; i < priv->data_self_heal_window_size; i++) { - if (sh_priv->loops[i]) { - if (sh_priv->loops[i]->write_needed) - GF_FREE (sh_priv->loops[i]->write_needed); - - if (sh_priv->loops[i]->checksum) - GF_FREE (sh_priv->loops[i]->checksum); - - GF_FREE (sh_priv->loops[i]); - } - } - - if (sh_priv) { - if (sh_priv->loops) - GF_FREE (sh_priv->loops); - - GF_FREE (sh_priv); - } - - -} - - -static uint32_t -__make_cookie (int loop_index, int child_index) -{ - uint32_t ret = (loop_index << 16) | child_index; - return ret; -} - - -static int -__loop_index (uint32_t cookie) -{ - return (cookie & 0xFFFF0000) >> 16; -} - - -static int -__child_index (uint32_t cookie) -{ - return (cookie & 0x0000FFFF); -} - - -static void -sh_diff_loop_state_reset (struct sh_diff_loop_state *loop_state, int child_count) -{ - loop_state->active = _gf_false; -// loop_state->offset = 0; - - memset (loop_state->write_needed, - 0, sizeof (*loop_state->write_needed) * child_count); - - memset (loop_state->checksum, - 0, MD5_DIGEST_LEN * child_count); -} - - -static int -sh_diff_number_of_writes_needed (unsigned char *write_needed, int child_count) -{ - int writes = 0; - int i; - - for (i = 0; i < child_count; i++) { - if (write_needed[i]) - writes++; - } - - return writes; -} - - -static int -sh_diff_loop_driver_done (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - int32_t total_blocks = 0; - int32_t diff_blocks = 0; - - - priv = this->private; - 
local = frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - total_blocks = sh_priv->total_blocks; - diff_blocks = sh_priv->diff_blocks; - - sh_diff_private_cleanup (frame, this); - if (sh->op_failed) { - gf_log (this->name, GF_LOG_TRACE, - "diff self-heal aborting on %s", - local->loc.path); - - local->self_heal.algo_abort_cbk (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "diff self-heal completed on %s", - local->loc.path); - - - gf_log (this->name, GF_LOG_NORMAL, - "diff self-heal on %s: %d blocks of %d were different (%.2f%%)", - local->loc.path, diff_blocks, total_blocks, - ((diff_blocks * 1.0)/total_blocks) * 100); - - local->self_heal.algo_completion_cbk (frame, this); - } - - return 0; -} - -static int -sh_diff_loop_driver (call_frame_t *frame, xlator_t *this, - gf_boolean_t is_first_call, - struct sh_diff_loop_state *loop_state); - -static int -sh_diff_loop_return (call_frame_t *rw_frame, xlator_t *this, - struct sh_diff_loop_state *loop_state) -{ - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - - priv = this->private; - - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = sh->private; - - gf_log (this->name, GF_LOG_TRACE, - "loop for offset %"PRId64" returned", loop_state->offset); - - AFR_STACK_DESTROY (rw_frame); - - sh_diff_loop_driver (sh_frame, this, _gf_false, loop_state); - - return 0; -} - - -static int -sh_diff_write_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *postbuf) -{ - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = 
NULL; - afr_self_heal_t *sh = NULL; - - afr_sh_algo_diff_private_t *sh_priv; - struct sh_diff_loop_state *loop_state; - - int call_count = 0; - int child_index = 0; - int loop_index = 0; - - priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = sh->private; - - child_index = __child_index ((uint32_t) (long) cookie); - loop_index = __loop_index ((uint32_t) (long) cookie); - loop_state = sh_priv->loops[loop_index]; - - gf_log (this->name, GF_LOG_TRACE, - "wrote %d bytes of data from %s to child %d, offset %"PRId64"", - op_ret, sh_local->loc.path, child_index, - loop_state->offset); - - LOCK (&sh_frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "write to %s failed on subvolume %s (%s)", - sh_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->op_failed = 1; - } - } - UNLOCK (&sh_frame->lock); - - call_count = afr_frame_return (rw_frame); - - if (call_count == 0) { - sh_diff_loop_return (rw_frame, this, loop_state); - } - - return 0; -} - - -static int -sh_diff_read_cbk (call_frame_t *rw_frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref) -{ - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - - afr_sh_algo_diff_private_t * sh_priv = NULL; - - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - - int loop_index; - struct sh_diff_loop_state *loop_state; - - uint32_t wcookie; - - int i = 0; - int call_count = 0; - - priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = sh->private; - - loop_index = __loop_index ((uint32_t) (long) cookie); - loop_state = 
sh_priv->loops[loop_index]; - - call_count = sh_diff_number_of_writes_needed (loop_state->write_needed, - priv->child_count); - - rw_local->call_count = call_count; - - gf_log (this->name, GF_LOG_TRACE, - "read %d bytes of data from %s, offset %"PRId64"", - op_ret, sh_local->loc.path, loop_state->offset); - - if ((op_ret <= 0) || - (call_count == 0)) { - sh_diff_loop_return (rw_frame, this, loop_state); - - return 0; - } - - if (sh->file_has_holes) { - if (iov_0filled (vector, count) == 0) { - - sh_diff_loop_return (rw_frame, this, loop_state); - goto out; - } - } - - for (i = 0; i < priv->child_count; i++) { - if (loop_state->write_needed[i]) { - wcookie = __make_cookie (loop_index, i); - - STACK_WIND_COOKIE (rw_frame, sh_diff_write_cbk, - (void *) (long) wcookie, - priv->children[i], - priv->children[i]->fops->writev, - sh->healing_fd, vector, count, - loop_state->offset, iobref); - - if (!--call_count) - break; - } - } - -out: - return 0; -} - - -static int -sh_diff_read (call_frame_t *rw_frame, xlator_t *this, - int loop_index) -{ - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - - afr_sh_algo_diff_private_t * sh_priv = NULL; - struct sh_diff_loop_state *loop_state; - - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - - uint32_t cookie; - - priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = sh->private; - - loop_state = sh_priv->loops[loop_index]; - - cookie = __make_cookie (loop_index, sh->source); - - STACK_WIND_COOKIE (rw_frame, sh_diff_read_cbk, - (void *) (long) cookie, - priv->children[sh->source], - priv->children[sh->source]->fops->readv, - sh->healing_fd, sh_priv->block_size, - loop_state->offset); - - return 0; -} - - -static int -sh_diff_checksum_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, - int32_t op_ret, 
int32_t op_errno, - uint32_t weak_checksum, uint8_t *strong_checksum) -{ - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t *rw_sh = NULL; - - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - - afr_sh_algo_diff_private_t * sh_priv = NULL; - - int loop_index = 0; - int child_index = 0; - struct sh_diff_loop_state *loop_state; - - int call_count = 0; - int i = 0; - int write_needed = 0; - - priv = this->private; - - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - sh_priv = sh->private; - - child_index = __child_index ((uint32_t) (long) cookie); - loop_index = __loop_index ((uint32_t) (long) cookie); - - loop_state = sh_priv->loops[loop_index]; - - if (op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "checksum on %s failed on subvolume %s (%s)", - sh_local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - - sh->op_failed = 1; - } else { - memcpy (loop_state->checksum + child_index * MD5_DIGEST_LEN, - strong_checksum, - MD5_DIGEST_LEN); - } - - call_count = afr_frame_return (rw_frame); - - if (call_count == 0) { - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !sh_local->child_up[i]) - continue; - - if (memcmp (loop_state->checksum + (i * MD5_DIGEST_LEN), - loop_state->checksum + (sh->source * MD5_DIGEST_LEN), - MD5_DIGEST_LEN)) { - /* - Checksums differ, so this block - must be written to this sink - */ - - gf_log (this->name, GF_LOG_TRACE, - "checksum on subvolume %s at offset %" - PRId64" differs from that on source", - priv->children[i]->name, loop_state->offset); - - write_needed = loop_state->write_needed[i] = 1; - } - } - - LOCK (&sh_priv->lock); - { - sh_priv->total_blocks++; - if (write_needed) - sh_priv->diff_blocks++; - } - UNLOCK (&sh_priv->lock); - - if (write_needed && !sh->op_failed) { - sh_diff_read (rw_frame, this, loop_index); 
- } else { - sh->offset += sh_priv->block_size; - - sh_diff_loop_return (rw_frame, this, loop_state); - } - } - - return 0; -} - - -static int -sh_diff_find_unused_loop (afr_sh_algo_diff_private_t *sh_priv, int max) -{ - int i; - - LOCK (&sh_priv->lock); - { - for (i = 0; i < max; i++) { - if (sh_priv->loops[i]->active == _gf_false) { - sh_priv->loops[i]->active = _gf_true; - break; - } - } - } - UNLOCK (&sh_priv->lock); - - if (i == max) { - gf_log ("[sh-diff]", GF_LOG_ERROR, - "no free loops found! This shouldn't happen. Please" - " report this to gluster-devel@nongnu.org"); - } - - return i; -} - - -static int -sh_diff_checksum (call_frame_t *frame, xlator_t *this, off_t offset) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * sh = NULL; - afr_self_heal_t * rw_sh = NULL; - - afr_sh_algo_diff_private_t * sh_priv = NULL; - - call_frame_t *rw_frame = NULL; - - uint32_t cookie; - int loop_index = 0; - struct sh_diff_loop_state *loop_state = NULL; - - int32_t op_errno = 0; - - int call_count = 0; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh_priv = sh->private; - - rw_frame = copy_frame (frame); - if (!rw_frame) - goto out; - - ALLOC_OR_GOTO (rw_local, afr_local_t, out); - - rw_frame->local = rw_local; - rw_sh = &rw_local->self_heal; - - rw_sh->offset = sh->offset; - rw_sh->sh_frame = frame; - - call_count = sh->active_sinks + 1; /* sinks and source */ - - rw_local->call_count = call_count; - - loop_index = sh_diff_find_unused_loop (sh_priv, priv->data_self_heal_window_size); - - loop_state = sh_priv->loops[loop_index]; - loop_state->offset = offset; - - /* we need to send both the loop index and child index, - so squeeze them both into a 32-bit number */ - - cookie = __make_cookie (loop_index, sh->source); - - STACK_WIND_COOKIE (rw_frame, sh_diff_checksum_cbk, - (void *) (long) cookie, - priv->children[sh->source], - 
priv->children[sh->source]->fops->rchecksum, - sh->healing_fd, - offset, sh_priv->block_size); - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !local->child_up[i]) - continue; - - cookie = __make_cookie (loop_index, i); - - STACK_WIND_COOKIE (rw_frame, sh_diff_checksum_cbk, - (void *) (long) cookie, - priv->children[i], - priv->children[i]->fops->rchecksum, - sh->healing_fd, - offset, sh_priv->block_size); - - if (!--call_count) - break; - } - - return 0; - -out: - sh->op_failed = 1; - - sh_diff_loop_driver (frame, this, _gf_false, loop_state); - - return 0; -} - - -static int -sh_diff_loop_driver (call_frame_t *frame, xlator_t *this, - gf_boolean_t is_first_call, - struct sh_diff_loop_state *loop_state) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - gf_boolean_t is_driver_done = _gf_false; - blksize_t block_size = 0; - - int loop = 0; - - off_t offset = 0; - char sh_type_str[256] = {0,}; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - - afr_self_heal_type_str_get(sh, sh_type_str, sizeof(sh_type_str)); - - LOCK (&sh_priv->lock); - { - if (loop_state) - sh_diff_loop_state_reset (loop_state, priv->child_count); - if (_gf_false == is_first_call) - sh_priv->loops_running--; - offset = sh_priv->offset; - block_size = sh_priv->block_size; - while ((0 == sh->op_failed) && - (sh_priv->loops_running < priv->data_self_heal_window_size) - && (sh_priv->offset < sh->file_size)) { - - loop++; - gf_log (this->name, GF_LOG_TRACE, - "spawning a loop for offset %"PRId64, - sh_priv->offset); - - sh_priv->offset += sh_priv->block_size; - sh_priv->loops_running++; - - if (_gf_false == is_first_call) - break; - - } - if (0 == sh_priv->loops_running) { - is_driver_done = _gf_true; - } - } - UNLOCK (&sh_priv->lock); - - while (loop--) { - if (sh->op_failed) { - // op failed in other loop, stop spawning more loops - 
sh_diff_loop_driver (frame, this, _gf_false, NULL); - } else { - sh_diff_checksum (frame, this, offset); - offset += block_size; - } - } - - if (is_driver_done) { - sh_diff_loop_driver_done (frame, this); - } - return 0; -} - - -int -afr_sh_algo_diff (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - - int i; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh_priv = GF_CALLOC (1, sizeof (*sh_priv), - gf_afr_mt_afr_private_t); - - sh_priv->block_size = this->ctx->page_size; - - sh->private = sh_priv; - - LOCK_INIT (&sh_priv->lock); - - local->call_count = 0; - - sh_priv->loops = GF_CALLOC (priv->data_self_heal_window_size, - sizeof (*sh_priv->loops), - gf_afr_mt_sh_diff_loop_state); - - for (i = 0; i < priv->data_self_heal_window_size; i++) { - sh_priv->loops[i] = GF_CALLOC (1, sizeof (*sh_priv->loops[i]), - gf_afr_mt_sh_diff_loop_state); - - sh_priv->loops[i]->checksum = GF_CALLOC (priv->child_count, - MD5_DIGEST_LEN, gf_afr_mt_uint8_t); - sh_priv->loops[i]->write_needed = GF_CALLOC (priv->child_count, - sizeof (*sh_priv->loops[i]->write_needed), - gf_afr_mt_char); - } - - sh_diff_loop_driver (frame, this, _gf_true, NULL); - - return 0; -} - - -struct afr_sh_algorithm afr_self_heal_algorithms[] = { - {.name = "full", .fn = afr_sh_algo_full}, - {.name = "diff", .fn = afr_sh_algo_diff}, - {0, 0}, -}; diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h deleted file mode 100644 index e45621b0ecd..00000000000 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#ifndef __AFR_SELF_HEAL_ALGORITHM_H__ -#define __AFR_SELF_HEAL_ALGORITHM_H__ - - -typedef int (*afr_sh_algo_fn) (call_frame_t *frame, - xlator_t *this); - -struct afr_sh_algorithm { - const char *name; - afr_sh_algo_fn fn; -}; - -extern struct afr_sh_algorithm afr_self_heal_algorithms[3]; - -typedef struct { - gf_lock_t lock; - unsigned int loops_running; - off_t offset; -} afr_sh_algo_full_private_t; - -struct sh_diff_loop_state { - off_t offset; - unsigned char *write_needed; - uint8_t *checksum; - gf_boolean_t active; -}; - -typedef struct { - size_t block_size; - - gf_lock_t lock; - unsigned int loops_running; - off_t offset; - - int32_t total_blocks; - int32_t diff_blocks; - - struct sh_diff_loop_state **loops; -} afr_sh_algo_diff_private_t; - -#endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 16e051c6ff9..a580a1584cc 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1,1662 +1,2934 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
- GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#include "glusterfs.h" -#include "xlator.h" -#include "byte-order.h" - #include "afr.h" -#include "afr-transaction.h" -#include "afr-self-heal-common.h" #include "afr-self-heal.h" -#include "pump.h" +#include <glusterfs/byte-order.h> +#include "protocol-common.h" +#include "afr-messages.h" +#include <glusterfs/events.h> -/** - * select_source - select a source and return it - */ +void +afr_heal_synctask(xlator_t *this, afr_local_t *local); int -afr_sh_select_source (int sources[], int child_count) +afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name, + inode_t *inode, struct afr_reply *replies, int source, + unsigned char *sources, void *gfid, int *gfid_idx) { - int i; - for (i = 0; i < child_count; i++) - if (sources[i]) - return i; + afr_private_t *priv = NULL; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + unsigned char *wind_on = NULL; + ia_type_t ia_type = IA_INVAL; + dict_t *xdata = NULL; + loc_t loc = { + 0, + }; + int ret = 0; + int i = 0; + + priv = this->private; + wind_on = alloca0(priv->child_count); + if (source >= 0 && 
replies[source].valid && replies[source].op_ret == 0) + ia_type = replies[source].poststat.ia_type; + + if (ia_type != IA_INVAL) + goto heal; + + /* If ia_type is still invalid, it means either + * (a)'source' was -1, i.e. parent dir pending xattrs are in split-brain + * (or) (b) The parent dir pending xattrs are all zeroes (i.e. all bricks + * are sources) and the 'source' we selected earlier might be the one where + * the file is not actually present. + * + * In both cases, let us pick a brick with a successful reply and use its + * ia_type. + * */ + for (i = 0; i < priv->child_count; i++) { + if (source == -1) { + /* case (a) above. */ + if (replies[i].valid && replies[i].op_ret == 0 && + replies[i].poststat.ia_type != IA_INVAL) { + ia_type = replies[i].poststat.ia_type; + break; + } + } else { + /* case (b) above. */ + if (i == source) + continue; + if (sources[i] && replies[i].valid && replies[i].op_ret == 0 && + replies[i].poststat.ia_type != IA_INVAL) { + ia_type = replies[i].poststat.ia_type; + break; + } + } + } - return -1; -} +heal: + /* gfid heal on those subvolumes that do not have gfid associated + * with the inode and update those replies. 
+ */ + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; + if (gf_uuid_is_null(gfid) && + !gf_uuid_is_null(replies[i].poststat.ia_gfid) && + replies[i].poststat.ia_type == ia_type) + gfid = replies[i].poststat.ia_gfid; -/** - * sink_count - return number of sinks in sources array - */ + if (!gf_uuid_is_null(replies[i].poststat.ia_gfid) || + replies[i].poststat.ia_type != ia_type) + continue; + + wind_on[i] = 1; + } + + if (AFR_COUNT(wind_on, priv->child_count) == 0) + return 0; + + xdata = dict_new(); + if (!xdata) { + ret = -ENOMEM; + goto out; + } + + ret = dict_set_gfuuid(xdata, "gfid-req", gfid, true); + if (ret) { + ret = -ENOMEM; + goto out; + } + + frame = afr_frame_create(this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + + local = frame->local; + loc.parent = inode_ref(parent); + gf_uuid_copy(loc.pargfid, parent->gfid); + loc.name = name; + loc.inode = inode_ref(inode); + + AFR_ONLIST(wind_on, frame, afr_selfheal_discover_cbk, lookup, &loc, xdata); + + for (i = 0; i < priv->child_count; i++) { + if (!wind_on[i]) + continue; + afr_reply_wipe(&replies[i]); + afr_reply_copy(&replies[i], &local->replies[i]); + } + if (gfid_idx && (*gfid_idx == -1)) { + /*Pick a brick where the gifd heal was successful.*/ + for (i = 0; i < priv->child_count; i++) { + if (!wind_on[i]) + continue; + if (replies[i].valid && replies[i].op_ret == 0 && + !gf_uuid_is_null(replies[i].poststat.ia_gfid)) { + *gfid_idx = i; + break; + } + } + } +out: + if (gfid_idx && (*gfid_idx == -1) && (ret == 0) && local) { + ret = -afr_final_errno(local, priv); + } + loc_wipe(&loc); + if (frame) + AFR_STACK_DESTROY(frame); + if (xdata) + dict_unref(xdata); + + return ret; +} int -afr_sh_sink_count (int sources[], int child_count) +afr_gfid_sbrain_source_from_src_brick(xlator_t *this, struct afr_reply *replies, + char *src_brick) { - int i; - int sinks = 0; - for (i = 0; i < child_count; i++) - if (!sources[i]) - sinks++; - return sinks; 
+ int i = 0; + afr_private_t *priv = NULL; + + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (strcmp(priv->children[i]->name, src_brick) == 0) + return i; + } + return -1; } int -afr_sh_source_count (int sources[], int child_count) +afr_selfheal_gfid_mismatch_by_majority(struct afr_reply *replies, + int child_count) { - int i; - int nsource = 0; + int j = 0; + int i = 0; + int votes; + + for (i = 0; i < child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + + votes = 1; + for (j = i + 1; j < child_count; j++) { + if ((!gf_uuid_compare(replies[i].poststat.ia_gfid, + replies[j].poststat.ia_gfid))) + votes++; + if (votes > child_count / 2) + return i; + } + } - for (i = 0; i < child_count; i++) - if (sources[i]) - nsource++; - return nsource; + return -1; } - int -afr_sh_supress_errenous_children (int sources[], int child_errno[], - int child_count) +afr_gfid_sbrain_source_from_bigger_file(struct afr_reply *replies, + int child_count) { - int i = 0; - - for (i = 0; i < child_count; i++) { - if (child_errno[i] && sources[i]) { - sources[i] = 0; - } - } - - return 0; + int i = 0; + int src = -1; + uint64_t size = 0; + + for (i = 0; i < child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (size < replies[i].poststat.ia_size) { + src = i; + size = replies[i].poststat.ia_size; + } else if (replies[i].poststat.ia_size == size) { + src = -1; + } + } + return src; } - -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) +int +afr_gfid_sbrain_source_from_latest_mtime(struct afr_reply *replies, + int child_count) { - afr_private_t * priv = this->private; - - char *buf = NULL; - char *ptr = NULL; - - int i, j; + int i = 0; + int src = -1; + uint32_t mtime = 0; + uint32_t mtime_nsec = 0; + + for (i = 0; i < child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; + if ((mtime 
< replies[i].poststat.ia_mtime) || + ((mtime == replies[i].poststat.ia_mtime) && + (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) { + src = i; + mtime = replies[i].poststat.ia_mtime; + mtime_nsec = replies[i].poststat.ia_mtime_nsec; + } else if ((mtime == replies[i].poststat.ia_mtime) && + (mtime_nsec == replies[i].poststat.ia_mtime_nsec)) { + src = -1; + } + } + return src; +} - /* 10 digits per entry + 1 space + '[' and ']' */ - buf = GF_MALLOC (priv->child_count * 11 + 8, gf_afr_mt_char); +int +afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies, + inode_t *inode, uuid_t pargfid, const char *bname, + int src_idx, int child_idx, + unsigned char *locked_on, int *src, dict_t *xdata) +{ + afr_private_t *priv = NULL; + char g1[64] = { + 0, + }; + char g2[64] = { + 0, + }; + int up_count = 0; + int heal_op = -1; + int ret = -1; + char *src_brick = NULL; + + *src = -1; + priv = this->private; + up_count = AFR_COUNT(locked_on, priv->child_count); + if (up_count != priv->child_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "All the bricks should be up to resolve the gfid split " + "barin"); + if (xdata) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + SALL_BRICKS_UP_TO_RESOLVE); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED, + "Error setting" + " gfid-heal-msg dict"); + } + goto out; + } - for (i = 0; i < priv->child_count; i++) { - ptr = buf; - ptr += sprintf (ptr, "[ "); - for (j = 0; j < priv->child_count; j++) { - ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); - } - sprintf (ptr, "]"); - gf_log (this->name, GF_LOG_TRACE, - "pending_matrix: %s", buf); - } + if (xdata) { + ret = dict_get_int32_sizen(xdata, "heal-op", &heal_op); + if (ret) + goto fav_child; + } else { + goto fav_child; + } + + switch (heal_op) { + case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: + *src = afr_gfid_sbrain_source_from_bigger_file(replies, + priv->child_count); + if (*src == -1) { + gf_msg(this->name, 
GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + SNO_BIGGER_FILE); + if (xdata) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + SNO_BIGGER_FILE); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_DICT_SET_FAILED, + "Error" + " setting gfid-heal-msg dict"); + } + } + break; + + case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME: + *src = afr_gfid_sbrain_source_from_latest_mtime(replies, + priv->child_count); + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + SNO_DIFF_IN_MTIME); + if (xdata) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + SNO_DIFF_IN_MTIME); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_DICT_SET_FAILED, + "Error" + "setting gfid-heal-msg dict"); + } + } + break; + + case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK: + ret = dict_get_str_sizen(xdata, "child-name", &src_brick); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "Error getting the source " + "brick"); + break; + } + *src = afr_gfid_sbrain_source_from_src_brick(this, replies, + src_brick); + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + SERROR_GETTING_SRC_BRICK); + if (xdata) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + SERROR_GETTING_SRC_BRICK); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_DICT_SET_FAILED, + "Error" + " setting gfid-heal-msg dict"); + } + } + break; + + default: + break; + } + goto out; + +fav_child: + switch (priv->fav_child_policy) { + case AFR_FAV_CHILD_BY_SIZE: + *src = afr_sh_fav_by_size(this, replies, inode); + break; + case AFR_FAV_CHILD_BY_MTIME: + *src = afr_sh_fav_by_mtime(this, replies, inode); + break; + case AFR_FAV_CHILD_BY_CTIME: + *src = afr_sh_fav_by_ctime(this, replies, inode); + break; + case AFR_FAV_CHILD_BY_MAJORITY: + if (priv->child_count != 2) + *src = afr_selfheal_gfid_mismatch_by_majority( + replies, priv->child_count); + else + *src = -1; + + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, 
AFR_MSG_SPLIT_BRAIN, + "No majority to resolve " + "gfid split brain"); + } + break; + default: + break; + } - GF_FREE (buf); +out: + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "Gfid mismatch detected for <gfid:%s>/%s>, %s on %s and" + " %s on %s.", + uuid_utoa(pargfid), bname, + uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), + priv->children[child_idx]->name, + uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2), + priv->children[src_idx]->name); + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;type=gfid;file=" + "<gfid:%s>/%s>;count=2;child-%d=%s;gfid-%d=%s;" + "child-%d=%s;gfid-%d=%s", + this->ctx->cmd_args.client_pid, this->name, uuid_utoa(pargfid), + bname, child_idx, priv->children[child_idx]->name, child_idx, + uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), src_idx, + priv->children[src_idx]->name, src_idx, + uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2)); + return -1; + } + return 0; } - -void -afr_sh_build_pending_matrix (afr_private_t *priv, - int32_t *pending_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type) +int +afr_selfheal_post_op_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - int i, j, k; + afr_local_t *local = NULL; - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. 
*/ - int32_t pending[3]; - void *pending_raw = NULL; - int ret = -1; + local = frame->local; - unsigned char *ignorant_subvols = NULL; + local->op_ret = op_ret; + local->op_errno = op_errno; + syncbarrier_wake(&local->barrier); - ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), child_count, - gf_afr_mt_char); + return 0; +} - /* start clean */ - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - pending_matrix[i][j] = 0; - } - } +int +afr_selfheal_post_op(call_frame_t *frame, xlator_t *this, inode_t *inode, + int subvol, dict_t *xattr, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + loc_t loc = { + 0, + }; + int ret = 0; - for (i = 0; i < child_count; i++) { - pending_raw = NULL; + priv = this->private; + local = frame->local; - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], priv->pending_key[j], - &pending_raw); + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - if (ret != 0) { - /* - * There is no xattr present. This means this - * subvolume should be considered an 'ignorant' - * subvolume. - */ + local->op_ret = 0; - ignorant_subvols[i] = 1; - continue; - } + STACK_WIND(frame, afr_selfheal_post_op_cbk, priv->children[subvol], + priv->children[subvol]->fops->xattrop, &loc, + GF_XATTROP_ADD_ARRAY, xattr, xdata); - memcpy (pending, pending_raw, sizeof(pending)); - k = afr_index_for_transaction_type (type); + syncbarrier_wait(&local->barrier, 1); + if (local->op_ret < 0) + ret = -local->op_errno; - pending_matrix[i][j] = ntoh32 (pending[k]); - } - } + loc_wipe(&loc); + local->op_ret = 0; - /* - * Make all non-ignorant subvols point towards the ignorant - * subvolumes. 
- */ + return ret; +} - for (i = 0; i < child_count; i++) { - if (ignorant_subvols[i]) { - for (j = 0; j < child_count; j++) { - if (!ignorant_subvols[j]) - pending_matrix[j][i] += 1; - } - } +int +afr_check_stale_error(struct afr_reply *replies, afr_private_t *priv) +{ + int i = 0; + int op_errno = 0; + int tmp_errno = 0; + int stale_count = 0; + + for (i = 0; i < priv->child_count; i++) { + tmp_errno = replies[i].op_errno; + if (tmp_errno == ENOENT || tmp_errno == ESTALE) { + op_errno = afr_higher_errno(op_errno, tmp_errno); + stale_count++; } - - GF_FREE (ignorant_subvols); + } + if (stale_count != priv->child_count) + return -ENOTCONN; + else + return -op_errno; } +int +afr_sh_generic_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + int i = (long)cookie; + afr_local_t *local = NULL; -/** - * mark_sources: Mark all 'source' nodes and return number of source - * nodes found - * - * A node (a row in the pending matrix) belongs to one of - * three categories: - * - * M is the pending matrix. - * - * 'innocent' - M[i] is all zeroes - * 'fool' - M[i] has i'th element = 1 (self-reference) - * 'wise' - M[i] has i'th element = 0, others are 1 or 0. - * - * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is - * needed. - * - * A 'wise' node can be a source. If two 'wise' nodes conflict, it is - * a split-brain. If one wise node refers to the other but the other doesn't - * refer back, the referrer is a source. - * - * All fools are sinks, unless there are no 'wise' nodes. In that case, - * one of the fools is made a source. 
- */ + local = frame->local; -typedef enum { - AFR_NODE_INNOCENT, - AFR_NODE_FOOL, - AFR_NODE_WISE -} afr_node_type; + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (pre) + local->replies[i].prestat = *pre; + if (post) + local->replies[i].poststat = *post; + if (xdata) + local->replies[i].xdata = dict_ref(xdata); -typedef struct { - afr_node_type type; - int wisdom; -} afr_node_character; + syncbarrier_wake(&local->barrier); + return 0; +} -static int -afr_sh_is_innocent (int32_t *array, int child_count) +int +afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, unsigned char *healed_sinks, + struct afr_reply *replies) { - int i = 0; - int ret = 1; /* innocent until proven guilty */ + loc_t loc = { + 0, + }; - for (i = 0; i < child_count; i++) { - if (array[i]) { - ret = 0; - break; - } - } + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - return ret; -} + AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, setattr, &loc, + &replies[source].poststat, + (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME), + NULL); + loc_wipe(&loc); -static int -afr_sh_is_fool (int32_t *array, int i, int child_count) -{ - return array[i]; /* fool if accuses itself */ + return 0; } - -static int -afr_sh_is_wise (int32_t *array, int i, int child_count) +dict_t * +afr_selfheal_output_xattr(xlator_t *this, gf_boolean_t is_full_crawl, + afr_transaction_type type, int *output_dirty, + int **output_matrix, int subvol, + int **full_heal_mtx_out) { - return !array[i]; /* wise if does not accuse itself */ -} - - -static int -afr_sh_all_nodes_innocent (afr_node_character *characters, - int child_count) -{ - int i = 0; - int ret = 1; - - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_INNOCENT) { - ret = 0; - break; - } + int j = 0; + int idx = 0; + int d_idx = 0; + int ret = 0; + int *raw = 0; + dict_t *xattr = NULL; + 
afr_private_t *priv = NULL; + + priv = this->private; + idx = afr_index_for_transaction_type(type); + d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION); + + xattr = dict_new(); + if (!xattr) + return NULL; + + /* clear dirty */ + raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t); + if (!raw) + goto err; + + raw[idx] = hton32(output_dirty[subvol]); + ret = dict_set_bin(xattr, AFR_DIRTY, raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + GF_FREE(raw); + goto err; + } + + /* clear/set pending */ + for (j = 0; j < priv->child_count; j++) { + raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t); + if (!raw) + goto err; + + raw[idx] = hton32(output_matrix[subvol][j]); + if (is_full_crawl) + raw[d_idx] = hton32(full_heal_mtx_out[subvol][j]); + + ret = dict_set_bin(xattr, priv->pending_key[j], raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + GF_FREE(raw); + goto err; } + } - return ret; + return xattr; +err: + if (xattr) + dict_unref(xattr); + return NULL; } - -static int -afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) +int +afr_selfheal_undo_pending(call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *undid_pending, + afr_transaction_type type, struct afr_reply *replies, + unsigned char *locked_on) { - int i = 0; - int ret = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int j = 0; + unsigned char *pending = NULL; + int *input_dirty = NULL; + int **input_matrix = NULL; + int **full_heal_mtx_in = NULL; + int **full_heal_mtx_out = NULL; + int *output_dirty = NULL; + int **output_matrix = NULL; + dict_t *xattr = NULL; + dict_t *xdata = NULL; + + priv = this->private; + local = frame->local; + + pending = alloca0(priv->child_count); + + input_dirty = alloca0(priv->child_count * sizeof(int)); + input_matrix = ALLOC_MATRIX(priv->child_count, int); + full_heal_mtx_in 
= ALLOC_MATRIX(priv->child_count, int); + full_heal_mtx_out = ALLOC_MATRIX(priv->child_count, int); + output_dirty = alloca0(priv->child_count * sizeof(int)); + output_matrix = ALLOC_MATRIX(priv->child_count, int); + + xdata = dict_new(); + if (!xdata) + return -1; + + afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix); + + if (local->need_full_crawl) + afr_selfheal_extract_xattr(this, replies, AFR_DATA_TRANSACTION, NULL, + full_heal_mtx_in); + + for (i = 0; i < priv->child_count; i++) + if (sinks[i] && !healed_sinks[i]) + pending[i] = 1; + + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (pending[j]) { + output_matrix[i][j] = 1; + if (type == AFR_ENTRY_TRANSACTION) + full_heal_mtx_out[i][j] = 1; + } else if (locked_on[j]) { + output_matrix[i][j] = -input_matrix[i][j]; + if (type == AFR_ENTRY_TRANSACTION) + full_heal_mtx_out[i][j] = -full_heal_mtx_in[i][j]; + } + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!pending[i]) + output_dirty[i] = -input_dirty[i]; + } + + for (i = 0; i < priv->child_count; i++) { + if (!locked_on[i]) + /* perform post-op only on subvols we had locked + and inspected on. + */ + continue; + if (undid_pending[i]) + /* We already unset the pending xattrs in + * _afr_fav_child_reset_sink_xattrs(). 
*/ + continue; + + xattr = afr_selfheal_output_xattr(this, local->need_full_crawl, type, + output_dirty, output_matrix, i, + full_heal_mtx_out); + if (!xattr) { + continue; + } - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - ret = 1; - break; - } + if ((type == AFR_ENTRY_TRANSACTION) && (priv->esh_granular)) { + if (xdata && dict_set_int8(xdata, GF_XATTROP_PURGE_INDEX, 1)) + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_DICT_SET_FAILED, + "Failed to set" + " dict value for %s", + GF_XATTROP_PURGE_INDEX); } - return ret; -} + afr_selfheal_post_op(frame, this, inode, i, xattr, xdata); + dict_unref(xattr); + } + if (xdata) + dict_unref(xdata); -/* - * The 'wisdom' of a wise node is 0 if any other wise node accuses it. - * It is 1 if no other wise node accuses it. - * Only wise nodes with wisdom 1 are sources. - * - * If no nodes with wisdom 1 exist, a split-brain has occured. - */ + return 0; +} -static void -afr_sh_compute_wisdom (int32_t *pending_matrix[], - afr_node_character characters[], int child_count) +void +afr_reply_copy(struct afr_reply *dst, struct afr_reply *src) { - int i = 0; - int j = 0; + dict_t *xdata = NULL; + + dst->valid = src->valid; + dst->op_ret = src->op_ret; + dst->op_errno = src->op_errno; + dst->prestat = src->prestat; + dst->poststat = src->poststat; + dst->preparent = src->preparent; + dst->postparent = src->postparent; + dst->preparent2 = src->preparent2; + dst->postparent2 = src->postparent2; + if (src->xdata) + xdata = dict_ref(src->xdata); + else + xdata = NULL; + if (dst->xdata) + dict_unref(dst->xdata); + dst->xdata = xdata; + if (xdata && dict_get_str_boolean(xdata, "fips-mode-rchecksum", + _gf_false) == _gf_true) { + memcpy(dst->checksum, src->checksum, SHA256_DIGEST_LENGTH); + } else { + memcpy(dst->checksum, src->checksum, MD5_DIGEST_LENGTH); + } + dst->fips_mode_rchecksum = src->fips_mode_rchecksum; +} - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - 
characters[i].wisdom = 1; +void +afr_replies_copy(struct afr_reply *dst, struct afr_reply *src, int count) +{ + int i = 0; - for (j = 0; j < child_count; j++) { - if ((characters[j].type == AFR_NODE_WISE) - && pending_matrix[j][i]) { + if (dst == src) + return; - characters[i].wisdom = 0; - } - } - } - } + for (i = 0; i < count; i++) { + afr_reply_copy(&dst[i], &src[i]); + } } - -static int -afr_sh_wise_nodes_conflict (afr_node_character *characters, - int child_count) +int +afr_selfheal_fill_dirty(xlator_t *this, int *dirty, int subvol, int idx, + dict_t *xdata) { - int i = 0; - int ret = 1; - - for (i = 0; i < child_count; i++) { - if ((characters[i].type == AFR_NODE_WISE) - && characters[i].wisdom == 1) { - - /* There is atleast one bona-fide wise node */ - ret = 0; - break; - } - } + void *pending_raw = NULL; + int pending[3] = { + 0, + }; - return ret; -} + if (!dirty) + return 0; + if (dict_get_ptr(xdata, AFR_DIRTY, &pending_raw)) + return -1; -static int -afr_sh_mark_wisest_as_sources (int sources[], - afr_node_character *characters, - int child_count) -{ - int nsources = 0; + if (!pending_raw) + return -1; - int i = 0; + memcpy(pending, pending_raw, sizeof(pending)); - for (i = 0; i < child_count; i++) { - if (characters[i].wisdom == 1) { - sources[i] = 1; - nsources++; - } - } + dirty[subvol] = ntoh32(pending[idx]); - return nsources; + return 0; } - -static int -afr_sh_mark_if_size_differs (afr_self_heal_t *sh, int child_count) +int +afr_selfheal_fill_matrix(xlator_t *this, int **matrix, int subvol, int idx, + dict_t *xdata) { - int32_t ** pending_matrix; - int i, j; + int i = 0; + void *pending_raw = NULL; + int pending[3] = { + 0, + }; + afr_private_t *priv = NULL; - int size_differs = 0; + priv = this->private; - pending_matrix = sh->pending_matrix; + if (!matrix) + return 0; - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - if (!sh->buf) - break; + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr(xdata, 
priv->pending_key[i], &pending_raw)) + continue; - if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[j]) - && (pending_matrix[i][j] == 0) - && (pending_matrix[j][i] == 0)) { + if (!pending_raw) + continue; - pending_matrix[i][j] = 1; - pending_matrix[j][i] = 1; + memcpy(pending, pending_raw, sizeof(pending)); - size_differs = 1; - } - } - } + matrix[subvol][i] = ntoh32(pending[idx]); + } - return size_differs; + return 0; } - -static int -afr_sh_mark_biggest_fool_as_source (afr_self_heal_t *sh, - afr_node_character *characters, - int child_count) +int +afr_selfheal_extract_xattr(xlator_t *this, struct afr_reply *replies, + afr_transaction_type type, int *dirty, int **matrix) { - int i = 0; - int biggest = 0; + afr_private_t *priv = NULL; + int i = 0; + dict_t *xdata = NULL; + int idx = -1; - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_FOOL) { - biggest = i; - break; - } - } + idx = afr_index_for_transaction_type(type); - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; + priv = this->private; - if (!sh->buf) - break; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; - if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) { - biggest = i; - } - } + if (!replies[i].xdata) + continue; - sh->sources[biggest] = 1; + xdata = replies[i].xdata; - return 1; -} + afr_selfheal_fill_dirty(this, dirty, i, idx, xdata); + afr_selfheal_fill_matrix(this, matrix, i, idx, xdata); + } + return 0; +} -static int -afr_sh_mark_biggest_as_source (afr_self_heal_t *sh, int child_count) +/* + * If by chance there are multiple sources with differing sizes, select + * the largest file as the source. 
+ * + * This can happen if data was directly modified in the backend or for snapshots + */ +void +afr_mark_largest_file_as_source(xlator_t *this, unsigned char *sources, + struct afr_reply *replies) { - int biggest = 0; - int i; - - for (i = 0; i < child_count; i++) { - if (!sh->buf) - break; - - if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) { - biggest = i; - } + int i = 0; + afr_private_t *priv = NULL; + uint64_t size = 0; + + /* Find source with biggest file size */ + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (!replies[i].valid || replies[i].op_ret != 0) { + sources[i] = 0; + continue; } - - sh->sources[biggest] = 1; - - return 1; + if (size <= replies[i].poststat.ia_size) { + size = replies[i].poststat.ia_size; + } + } + + /* Mark sources with less size as not source */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (size > replies[i].poststat.ia_size) + sources[i] = 0; + } } - -static int -afr_sh_mark_loweia_uid_as_source (afr_self_heal_t *sh, int child_count) +void +afr_mark_latest_mtime_file_as_source(xlator_t *this, unsigned char *sources, + struct afr_reply *replies) { - uid_t smallest = 0; - int i; - - for (i = 0; i < child_count; i++) { - if (!sh->buf) - break; - - if (sh->buf[i].ia_uid < sh->buf[smallest].ia_uid) { - smallest = i; - } + int i = 0; + afr_private_t *priv = NULL; + uint32_t mtime = 0; + uint32_t mtime_nsec = 0; + + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (!replies[i].valid || replies[i].op_ret != 0) { + sources[i] = 0; + continue; } - - sh->sources[smallest] = 1; - - return 1; + if ((mtime < replies[i].poststat.ia_mtime) || + ((mtime == replies[i].poststat.ia_mtime) && + (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) { + mtime = replies[i].poststat.ia_mtime; + mtime_nsec = replies[i].poststat.ia_mtime_nsec; + } + } + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + 
continue; + if ((mtime > replies[i].poststat.ia_mtime) || + ((mtime == replies[i].poststat.ia_mtime) && + (mtime_nsec > replies[i].poststat.ia_mtime_nsec))) { + sources[i] = 0; + } + } } - -int -afr_sh_mark_sources (afr_self_heal_t *sh, int child_count, - afr_self_heal_type type) +void +afr_mark_active_sinks(xlator_t *this, unsigned char *sources, + unsigned char *locked_on, unsigned char *sinks) { - int i = 0; - - int32_t ** pending_matrix; - int * sources; - - int size_differs = 0; - - pending_matrix = sh->pending_matrix; - sources = sh->sources; + int i = 0; + afr_private_t *priv = NULL; - int nsources = 0; + priv = this->private; - /* stores the 'characters' (innocent, fool, wise) of the nodes */ - afr_node_character * - characters = GF_CALLOC (sizeof (afr_node_character), - child_count, - gf_afr_mt_afr_node_character) ; - - /* start clean */ - for (i = 0; i < child_count; i++) { - sources[i] = 0; - } + for (i = 0; i < priv->child_count; i++) { + if (!sources[i] && locked_on[i]) + sinks[i] = 1; + else + sinks[i] = 0; + } +} - for (i = 0; i < child_count; i++) { - if (afr_sh_is_innocent (pending_matrix[i], child_count)) { - characters[i].type = AFR_NODE_INNOCENT; +gf_boolean_t +afr_dict_contains_heal_op(call_frame_t *frame) +{ + afr_local_t *local = NULL; + dict_t *xdata_req = NULL; + int ret = 0; + int heal_op = -1; + + local = frame->local; + xdata_req = local->xdata_req; + ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op); + if (ret) + return _gf_false; + if (local->xdata_rsp == NULL) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) + return _gf_true; + } + ret = dict_set_sizen_str_sizen(local->xdata_rsp, "sh-fail-msg", + SFILE_NOT_IN_SPLIT_BRAIN); + + return _gf_true; +} - } else if (afr_sh_is_fool (pending_matrix[i], i, child_count)) { - characters[i].type = AFR_NODE_FOOL; +gf_boolean_t +afr_can_decide_split_brain_source_sinks(struct afr_reply *replies, + int child_count) +{ + int i = 0; - } else if (afr_sh_is_wise (pending_matrix[i], i, 
child_count)) { - characters[i].type = AFR_NODE_WISE; + for (i = 0; i < child_count; i++) + if (replies[i].valid != 1 || replies[i].op_ret != 0) + return _gf_false; - } else { - gf_log ("[module:replicate]", GF_LOG_ERROR, - "Could not determine the state of subvolume %d!" - " (This message should never appear." - " Please file a bug report to " - "<gluster-devel@nongnu.org>.)", i); - } - } + return _gf_true; +} - if (type == AFR_SELF_HEAL_DATA) { - size_differs = afr_sh_mark_if_size_differs (sh, child_count); +int +afr_mark_split_brain_source_sinks_by_heal_op( + call_frame_t *frame, xlator_t *this, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies, afr_transaction_type type, int heal_op) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata_req = NULL; + dict_t *xdata_rsp = NULL; + int ret = 0; + int i = 0; + char *name = NULL; + int source = -1; + + local = frame->local; + priv = this->private; + xdata_req = local->xdata_req; + + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + if (sources[i] || !sinks[i] || !healed_sinks[i]) { + ret = -1; + goto out; + } + } + if (local->xdata_rsp == NULL) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) { + ret = -1; + goto out; } - - if ((type == AFR_SELF_HEAL_METADATA) - && afr_sh_all_nodes_innocent (characters, child_count)) { - - nsources = afr_sh_mark_loweia_uid_as_source (sh, child_count); + } + xdata_rsp = local->xdata_rsp; + + if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SBRAIN_HEAL_NO_GO_MSG); + ret = -1; + goto out; + } + + for (i = 0; i < priv->child_count; i++) + if (locked_on[i]) + sources[i] = 1; + switch (heal_op) { + case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: + if (type == AFR_METADATA_TRANSACTION) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SUSE_SOURCE_BRICK_TO_HEAL); + if 
(!ret) + ret = -1; + goto out; + } + afr_mark_largest_file_as_source(this, sources, replies); + if (AFR_COUNT(sources, priv->child_count) != 1) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SNO_BIGGER_FILE); + if (!ret) + ret = -1; + goto out; + } + break; + case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME: + if (type == AFR_METADATA_TRANSACTION) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SUSE_SOURCE_BRICK_TO_HEAL); + if (!ret) + ret = -1; + goto out; + } + afr_mark_latest_mtime_file_as_source(this, sources, replies); + if (AFR_COUNT(sources, priv->child_count) != 1) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SNO_DIFF_IN_MTIME); + if (!ret) + ret = -1; + goto out; + } + break; + case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK: + ret = dict_get_str_sizen(xdata_req, "child-name", &name); + if (ret) + goto out; + source = afr_get_child_index_from_name(this, name); + if (source < 0) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SINVALID_BRICK_NAME); + if (!ret) + ret = -1; + goto out; + } + if (locked_on[source] != 1) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SBRICK_IS_NOT_UP); + if (!ret) + ret = -1; goto out; + } + memset(sources, 0, sizeof(*sources) * priv->child_count); + sources[source] = 1; + break; + default: + ret = -1; + goto out; + } + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + break; } + } + sinks[source] = 0; + healed_sinks[source] = 0; + ret = source; +out: + if (ret < 0) + memset(sources, 0, sizeof(*sources) * priv->child_count); + return ret; +} - if (afr_sh_all_nodes_innocent (characters, child_count)) { - if (size_differs) { - nsources = afr_sh_mark_biggest_as_source (sh, - child_count); - } - - } else if (afr_sh_wise_nodes_exist (characters, child_count)) { - afr_sh_compute_wisdom (pending_matrix, characters, child_count); - - if (afr_sh_wise_nodes_conflict (characters, child_count)) { - /* split-brain */ - - nsources = -1; - goto out; - 
- } else { - nsources = afr_sh_mark_wisest_as_sources (sources, - characters, - child_count); +int +afr_sh_fav_by_majority(xlator_t *this, struct afr_reply *replies, + inode_t *inode) +{ + afr_private_t *priv; + int vote_count = -1; + int fav_child = -1; + int i = 0; + int k = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid == 1) { + gf_msg_debug(this->name, 0, + "Child:%s mtime_sec = %" PRId64 ", size = %" PRIu64 + " for gfid %s", + priv->children[i]->name, replies[i].poststat.ia_mtime, + replies[i].poststat.ia_size, uuid_utoa(inode->gfid)); + vote_count = 0; + for (k = 0; k < priv->child_count; k++) { + if ((replies[k].poststat.ia_mtime == + replies[i].poststat.ia_mtime) && + (replies[k].poststat.ia_size == + replies[i].poststat.ia_size)) { + vote_count++; } - } else { - nsources = afr_sh_mark_biggest_fool_as_source (sh, characters, - child_count); + } + if (vote_count > priv->child_count / 2) { + fav_child = i; + break; + } } - -out: - GF_FREE (characters); - - return nsources; + } + return fav_child; } - -void -afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], int success[], - int child_count, afr_transaction_type type) -{ - int i = 0; - int j = 0; - int k = 0; - - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3]; - void *pending_raw = NULL; - int ret = 0; - - /* start clean */ - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - delta_matrix[i][j] = 0; - } +/* + * afr_sh_fav_by_mtime: Choose favorite child by mtime. 
+ */ +int +afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode) +{ + afr_private_t *priv; + int fav_child = -1; + int i = 0; + uint32_t cmp_mtime = 0; + uint32_t cmp_mtime_nsec = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid == 1) { + gf_msg_debug(this->name, 0, + "Child:%s mtime = %" PRId64 + ", mtime_nsec = %d for " + "gfid %s", + priv->children[i]->name, replies[i].poststat.ia_mtime, + replies[i].poststat.ia_mtime_nsec, + uuid_utoa(inode->gfid)); + if (replies[i].poststat.ia_mtime > cmp_mtime) { + cmp_mtime = replies[i].poststat.ia_mtime; + cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec; + fav_child = i; + } else if ((replies[i].poststat.ia_mtime == cmp_mtime) && + (replies[i].poststat.ia_mtime_nsec > cmp_mtime_nsec)) { + cmp_mtime = replies[i].poststat.ia_mtime; + cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec; + fav_child = i; + } } + } + return fav_child; +} - for (i = 0; i < child_count; i++) { - if (pending_raw) - pending_raw = NULL; - - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], priv->pending_key[j], - &pending_raw); - if (ret < 0) - gf_log ("afr_sh_pending_to_delta", - GF_LOG_DEBUG, - "Unable to get dict value."); - if (!success[j]) - continue; - - k = afr_index_for_transaction_type (type); - - if (pending_raw != NULL) { - memcpy (pending, pending_raw, sizeof(pending)); - delta_matrix[i][j] = -(ntoh32 (pending[k])); - } else { - delta_matrix[i][j] = 0; - } +/* + * afr_sh_fav_by_ctime: Choose favorite child by ctime. 
+ */ +int +afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode) +{ + afr_private_t *priv; + int fav_child = -1; + int i = 0; + uint32_t cmp_ctime = 0; + uint32_t cmp_ctime_nsec = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid == 1) { + gf_msg_debug(this->name, 0, + "Child:%s ctime = %" PRId64 + ", ctime_nsec = %d for " + "gfid %s", + priv->children[i]->name, replies[i].poststat.ia_ctime, + replies[i].poststat.ia_ctime_nsec, + uuid_utoa(inode->gfid)); + if (replies[i].poststat.ia_ctime > cmp_ctime) { + cmp_ctime = replies[i].poststat.ia_ctime; + cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec; + fav_child = i; + } else if ((replies[i].poststat.ia_ctime == cmp_ctime) && + (replies[i].poststat.ia_ctime_nsec > cmp_ctime_nsec)) { + cmp_ctime = replies[i].poststat.ia_ctime; + cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec; + fav_child = i; + } + } + } + return fav_child; +} - } +/* + * afr_sh_fav_by_size: Choose favorite child by size + * when not all files are of zero size. + */ +int +afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode) +{ + afr_private_t *priv; + int fav_child = -1; + int i = 0; + uint64_t cmp_sz = 0; + + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) { + continue; + } + gf_msg_debug(this->name, 0, + "Child:%s file size = %" PRIu64 " for gfid %s", + priv->children[i]->name, replies[i].poststat.ia_size, + uuid_utoa(inode->gfid)); + if (replies[i].poststat.ia_type == IA_IFDIR) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + "Cannot perform selfheal on %s. 
" + "Size policy is not applicable to directories.", + uuid_utoa(inode->gfid)); + break; } + if (replies[i].poststat.ia_size > cmp_sz) { + cmp_sz = replies[i].poststat.ia_size; + fav_child = i; + } else if (replies[i].poststat.ia_size == cmp_sz) { + fav_child = -1; + } + } + if (fav_child == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "No bigger file"); + } + return fav_child; } +int +afr_sh_get_fav_by_policy(xlator_t *this, struct afr_reply *replies, + inode_t *inode, char **policy_str) +{ + afr_private_t *priv = NULL; + int fav_child = -1; + + priv = this->private; + if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) { + return -1; + } + + switch (priv->fav_child_policy) { + case AFR_FAV_CHILD_BY_SIZE: + fav_child = afr_sh_fav_by_size(this, replies, inode); + if (policy_str && fav_child >= 0) { + *policy_str = "SIZE"; + } + break; + case AFR_FAV_CHILD_BY_CTIME: + fav_child = afr_sh_fav_by_ctime(this, replies, inode); + if (policy_str && fav_child >= 0) { + *policy_str = "CTIME"; + } + break; + case AFR_FAV_CHILD_BY_MTIME: + fav_child = afr_sh_fav_by_mtime(this, replies, inode); + if (policy_str && fav_child >= 0) { + *policy_str = "MTIME"; + } + break; + case AFR_FAV_CHILD_BY_MAJORITY: + fav_child = afr_sh_fav_by_majority(this, replies, inode); + if (policy_str && fav_child >= 0) { + *policy_str = "MAJORITY"; + } + break; + case AFR_FAV_CHILD_NONE: + default: + break; + } + + return fav_child; +} int -afr_sh_delta_to_xattr (afr_private_t *priv, - int32_t *delta_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type) +afr_mark_split_brain_source_sinks_by_policy( + call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies, afr_transaction_type type) { - int i = 0; - int j = 0; - int k = 0; + afr_private_t *priv = NULL; + int fav_child = -1; + char mtime_str[256]; + char 
ctime_str[256]; + char *policy_str = NULL; + struct tm *tm_ptr; + time_t time; + + priv = this->private; + + fav_child = afr_sh_get_fav_by_policy(this, replies, inode, &policy_str); + if (fav_child == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + "No child selected by favorite-child policy."); + } else if (fav_child > priv->child_count - 1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + "Invalid child (%d) " + "selected by policy %s.", + fav_child, policy_str); + } else if (fav_child >= 0) { + time = replies[fav_child].poststat.ia_mtime; + tm_ptr = localtime(&time); + strftime(mtime_str, sizeof(mtime_str), "%Y-%m-%d %H:%M:%S", tm_ptr); + time = replies[fav_child].poststat.ia_ctime; + tm_ptr = localtime(&time); + strftime(ctime_str, sizeof(ctime_str), "%Y-%m-%d %H:%M:%S", tm_ptr); + + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + "Source %s selected as authentic to resolve conflicting data " + "in file (gfid:%s) by %s (%" PRIu64 + " bytes @ %s mtime, %s " + "ctime).", + priv->children[fav_child]->name, uuid_utoa(inode->gfid), + policy_str, replies[fav_child].poststat.ia_size, mtime_str, + ctime_str); + + sources[fav_child] = 1; + sinks[fav_child] = 0; + healed_sinks[fav_child] = 0; + } + return fav_child; +} - int ret = 0; +gf_boolean_t +afr_is_file_empty_on_all_children(afr_private_t *priv, + struct afr_reply *replies) +{ + int i = 0; - int32_t *pending = 0; + for (i = 0; i < priv->child_count; i++) { + if ((!replies[i].valid) || (replies[i].op_ret != 0) || + (replies[i].poststat.ia_size != 0)) + return _gf_false; + } - for (i = 0; i < child_count; i++) { - if (!xattr[i]) - continue; + return _gf_true; +} - for (j = 0; j < child_count; j++) { - pending = GF_CALLOC (sizeof (int32_t), 3, - gf_afr_mt_int32_t); - /* 3 = data+metadata+entry */ +int +afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + 
unsigned char *locked_on, + struct afr_reply *replies, + afr_transaction_type type) +{ + int source = -1; + int i = 0; + afr_private_t *priv = this->private; + struct iatt stbuf = { + 0, + }; + + if ((AFR_COUNT(locked_on, priv->child_count) < priv->child_count) || + (afr_success_count(replies, priv->child_count) < priv->child_count)) + return -1; + + if (type == AFR_DATA_TRANSACTION) { + if (!afr_is_file_empty_on_all_children(priv, replies)) + return -1; + goto mark; + } + + /*For AFR_METADATA_TRANSACTION, metadata must be same on all bricks.*/ + stbuf = replies[0].poststat; + for (i = 1; i < priv->child_count; i++) { + if ((!IA_EQUAL(stbuf, replies[i].poststat, type)) || + (!IA_EQUAL(stbuf, replies[i].poststat, uid)) || + (!IA_EQUAL(stbuf, replies[i].poststat, gid)) || + (!IA_EQUAL(stbuf, replies[i].poststat, prot))) + return -1; + } + for (i = 1; i < priv->child_count; i++) { + if (!afr_xattrs_are_equal(replies[0].xdata, replies[i].xdata)) + return -1; + } + +mark: + /* data/metadata is same on all bricks. Pick one of them as source. Rest + * are sinks.*/ + for (i = 0; i < priv->child_count; i++) { + if (source == -1) { + source = i; + sources[i] = 1; + sinks[i] = 0; + healed_sinks[i] = 0; + continue; + } + sources[i] = 0; + sinks[i] = 1; + healed_sinks[i] = 1; + } - k = afr_index_for_transaction_type (type); + return source; +} - pending[k] = hton32 (delta_matrix[i][j]); +/* Return a source depending on the type of heal_op, and set sources[source], + * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so + * only if the following condition is met: + * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1)) + * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and + * sinks[node] are 1. This should be the case if the file is in split-brain. 
+ */ +int +afr_mark_split_brain_source_sinks( + call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies, afr_transaction_type type) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata_req = NULL; + int heal_op = -1; + int ret = -1; + int source = -1; + + local = frame->local; + priv = this->private; + xdata_req = local->xdata_req; + + source = afr_mark_source_sinks_if_file_empty( + this, sources, sinks, healed_sinks, locked_on, replies, type); + if (source >= 0) + return source; + + ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op); + if (ret) + goto autoheal; + + source = afr_mark_split_brain_source_sinks_by_heal_op( + frame, this, sources, sinks, healed_sinks, locked_on, replies, type, + heal_op); + return source; + +autoheal: + /* Automatically heal if fav_child_policy is set. */ + if (priv->fav_child_policy != AFR_FAV_CHILD_NONE) { + source = afr_mark_split_brain_source_sinks_by_policy( + frame, this, inode, sources, sinks, healed_sinks, locked_on, + replies, type); + if (source != -1) { + ret = dict_set_int32_sizen(xdata_req, "fav-child-policy", 1); + if (ret) + return -1; + } + } - ret = dict_set_bin (xattr[i], priv->pending_key[j], - pending, - 3 * sizeof (int32_t)); - if (ret < 0) - gf_log ("afr_sh_delta_to_xattr", - GF_LOG_WARNING, - "Unable to set dict value."); - } - } - return 0; + return source; } - int -afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this) +_afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this, + inode_t *inode, int source, + unsigned char *healed_sinks, + unsigned char *undid_pending, + afr_transaction_type type, + unsigned char *locked_on, + struct afr_reply *replies) { - afr_private_t *priv = NULL; - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. 
*/ - int32_t pending[3]; - void *pending_raw = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int *input_dirty = NULL; + int **input_matrix = NULL; + int *output_dirty = NULL; + int **output_matrix = NULL; + dict_t *xattr = NULL; + dict_t *xdata = NULL; + int i = 0; + + priv = this->private; + local = frame->local; + + if (!dict_get_sizen(local->xdata_req, "fav-child-policy")) + return 0; - int ret = -1; - int i = 0; - int j = 0; + xdata = dict_new(); + if (!xdata) + return -1; - priv = this->private; + input_dirty = alloca0(priv->child_count * sizeof(int)); + input_matrix = ALLOC_MATRIX(priv->child_count, int); + output_dirty = alloca0(priv->child_count * sizeof(int)); + output_matrix = ALLOC_MATRIX(priv->child_count, int); - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); + afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix); - if (ret != 0) - return 0; + for (i = 0; i < priv->child_count; i++) { + if (i == source || !healed_sinks[i]) + continue; + output_dirty[i] = -input_dirty[i]; + output_matrix[i][source] = -input_matrix[i][source]; + } - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i] || !locked_on[i]) + continue; + xattr = afr_selfheal_output_xattr(this, _gf_false, type, output_dirty, + output_matrix, i, NULL); - if (pending[j]) - return 1; - } + afr_selfheal_post_op(frame, this, inode, i, xattr, xdata); - return 0; -} + undid_pending[i] = 1; + dict_unref(xattr); + } + if (xdata) + dict_unref(xdata); -int -afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this) + return 0; +} + +gf_boolean_t +afr_does_witness_exist(xlator_t *this, uint64_t *witness) { - afr_private_t *priv = NULL; - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. 
*/ - int32_t pending[3]; - void *pending_raw = NULL; + int i = 0; + afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; + priv = this->private; - priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (witness[i]) + return _gf_true; + } + return _gf_false; +} - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); +unsigned int +afr_get_quorum_count(afr_private_t *priv) +{ + if (priv->quorum_count == AFR_QUORUM_AUTO) { + return priv->child_count / 2 + 1; + } else { + return priv->quorum_count; + } +} - if (ret != 0) - return 0; +void +afr_selfheal_post_op_failure_accounting(afr_private_t *priv, char *accused, + unsigned char *sources, + unsigned char *locked_on) +{ + int i = 0; + unsigned int quorum_count = 0; - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); + if (AFR_COUNT(sources, priv->child_count) != 0) + return; - if (pending[j]) - return 1; + quorum_count = afr_get_quorum_count(priv); + for (i = 0; i < priv->child_count; i++) { + if ((accused[i] < quorum_count) && locked_on[i]) { + sources[i] = 1; } - - return 0; + } + return; } +/* + * This function determines if a self-heal is required for a given inode, + * and if needed, in what direction. + * + * locked_on[] is the array representing servers which have been locked and + * from which xattrs have been fetched for analysis. + * + * The output of the function is by filling the arrays sources[] and sinks[]. + * + * sources[i] is set if i'th server is an eligible source for a selfheal. + * + * sinks[i] is set if i'th server needs to be healed. + * + * if sources[0..N] are all set, there is no need for a selfheal. + * + * if sinks[0..N] are all set, the inode is in split brain. 
+ * + */ int -afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this) +afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + uint64_t *witness, unsigned char *pflag) { - afr_private_t *priv = NULL; - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3]; - void *pending_raw = NULL; + afr_private_t *priv = NULL; + int i = 0; + int j = 0; + int *dirty = NULL; /* Denotes if dirty xattr is set */ + int **matrix = NULL; /* Changelog matrix */ + char *accused = NULL; /* Accused others without any self-accusal */ + char *pending = NULL; /* Have pending operations on others */ + char *self_accused = NULL; /* Accused itself */ + + priv = this->private; + + dirty = alloca0(priv->child_count * sizeof(int)); + accused = alloca0(priv->child_count); + pending = alloca0(priv->child_count); + self_accused = alloca0(priv->child_count); + matrix = ALLOC_MATRIX(priv->child_count, int); + memset(witness, 0, sizeof(*witness) * priv->child_count); + + /* First construct the pending matrix for further analysis */ + afr_selfheal_extract_xattr(this, replies, type, dirty, matrix); + + if (pflag) { + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) + if (matrix[i][j]) + *pflag |= PFLAG_PENDING; + if (*pflag) + break; + } + } + + if (afr_success_count(replies, priv->child_count) < priv->child_count) { + /* Treat this just like locks not being acquired */ + return -ENOTCONN; + } + + /* short list all self-accused */ + for (i = 0; i < priv->child_count; i++) { + if (matrix[i][i]) + self_accused[i] = 1; + } + + /* Next short list all accused to exclude them from being sources */ + /* Self-accused can't accuse others as they are FOOLs */ + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) { + if 
(!self_accused[i]) + accused[j] += 1; + if (i != j) + pending[i] += 1; + } + } + } - int ret = -1; - int i = 0; - int j = 0; + /* Short list all non-accused as sources */ + for (i = 0; i < priv->child_count; i++) { + if (!accused[i] && locked_on[i]) + sources[i] = 1; + else + sources[i] = 0; + } + + /* Everyone accused by non-self-accused sources are sinks */ + memset(sinks, 0, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (self_accused[i]) + continue; + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) + sinks[j] = 1; + } + } + + /* For breaking ties provide with number of fops they witnessed */ + + /* + * count the pending fops witnessed from itself to others when it is + * self-accused + */ + for (i = 0; i < priv->child_count; i++) { + if (!self_accused[i]) + continue; + for (j = 0; j < priv->child_count; j++) { + if (i == j) + continue; + witness[i] += matrix[i][j]; + } + } - priv = this->private; + if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) + afr_selfheal_post_op_failure_accounting(priv, accused, sources, + locked_on); + /* If no sources, all locked nodes are sinks - split brain */ + if (AFR_COUNT(sources, priv->child_count) == 0) { + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + sinks[i] = 1; + } + if (pflag) + *pflag |= PFLAG_SBRAIN; + } + + /* One more class of witness similar to dirty in v2 is where no pending + * exists but we have self-accusing markers. This can happen in afr-v1 + * if the brick crashes just after doing xattrop on self but + * before xattrop on the other xattrs on the brick in pre-op. 
*/ + if (AFR_COUNT(pending, priv->child_count) == 0) { for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); + if (self_accused[i]) + witness[i] += matrix[i][i]; + } + } else { + /* In afr-v1 if a file is self-accused and has pending + * operations on others then it is similar to 'dirty' in afr-v2. + * Consider such cases as witness. + */ + for (i = 0; i < priv->child_count; i++) { + if (self_accused[i] && pending[i]) + witness[i] += matrix[i][i]; + } + } - if (ret != 0) - return 0; + /* count the number of dirty fops witnessed */ + for (i = 0; i < priv->child_count; i++) + witness[i] += dirty[i]; - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); + return 0; +} - if (pending[j]) - return 1; +void +afr_log_selfheal(uuid_t gfid, xlator_t *this, int ret, char *type, int source, + unsigned char *sources, unsigned char *healed_sinks) +{ + char *status = NULL; + char *sinks_str = NULL; + char *p = NULL; + char *sources_str = NULL; + char *q = NULL; + afr_private_t *priv = NULL; + gf_loglevel_t loglevel = GF_LOG_NONE; + int i = 0; + + priv = this->private; + sinks_str = alloca0(priv->child_count * 8); + p = sinks_str; + sources_str = alloca0(priv->child_count * 8); + q = sources_str; + for (i = 0; i < priv->child_count; i++) { + if (healed_sinks[i]) + p += sprintf(p, "%d ", i); + if (sources[i]) { + if (source == i) { + q += sprintf(q, "[%d] ", i); + } else { + q += sprintf(q, "%d ", i); + } } - - return 0; + } + + if (ret < 0) { + status = "Failed"; + loglevel = GF_LOG_DEBUG; + } else { + status = "Completed"; + loglevel = GF_LOG_INFO; + } + + gf_msg(this->name, loglevel, 0, AFR_MSG_SELF_HEAL_INFO, + "%s %s selfheal on %s. 
" + "sources=%s sinks=%s", + status, type, uuid_utoa(gfid), sources_str, sinks_str); } - -/** - * is_matrix_zero - return true if pending matrix is all zeroes - */ - int -afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count) +afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *parbuf) { - int i, j; - - for (i = 0; i < child_count; i++) - for (j = 0; j < child_count; j++) - if (pending_matrix[i][j]) - return 0; - return 1; + afr_local_t *local = NULL; + int i = -1; + GF_UNUSED int ret = -1; + int8_t need_heal = 1; + + local = frame->local; + i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (buf) + local->replies[i].poststat = *buf; + if (parbuf) + local->replies[i].postparent = *parbuf; + if (xdata) { + local->replies[i].xdata = dict_ref(xdata); + ret = dict_get_int8(xdata, "link-count", &need_heal); + } + + local->replies[i].need_heal = need_heal; + syncbarrier_wake(&local->barrier); + + return 0; } - -int -afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) +inode_t * +afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, + unsigned char *lookup_on, dict_t *xattr) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; + loc_t loc = { + 0, + }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + inode_t *inode = NULL; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + local = frame->local; + priv = frame->this->private; -// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); - memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count); + xattr_req = dict_new(); + if (!xattr_req) + return NULL; - for (i = 0; i < priv->child_count; i++) { - 
sh->locked_nodes[i] = 0; - } + if (xattr) + dict_copy(xattr, xattr_req); - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } + if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) { + dict_unref(xattr_req); + return NULL; + } - if (local->govinda_gOvinda) { - gf_log (this->name, GF_LOG_TRACE, - "aborting selfheal of %s", - local->loc.path); - sh->completion_cbk (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); - afr_self_heal_metadata (frame, this); - } + inode = inode_new(parent->table); + if (!inode) { + dict_unref(xattr_req); + return NULL; + } - return 0; -} + loc.parent = inode_ref(parent); + gf_uuid_copy(loc.pargfid, parent->gfid); + loc.name = name; + loc.inode = inode_ref(inode); + + AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); + + afr_replies_copy(replies, local->replies, priv->child_count); + + loc_wipe(&loc); + dict_unref(xattr_req); + return inode; +} static int -sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) +afr_set_multi_dom_lock_count_request(xlator_t *this, dict_t *dict) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + int ret = 0; + afr_private_t *priv = NULL; + char *key1 = NULL; + char *key2 = NULL; + + priv = this->private; + key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(this->name)); + key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(priv->sh_domain)); + + ret = dict_set_uint32(dict, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS, 1); + if (ret) + return ret; - local = frame->local; - int_lock = &local->internal_lock; + sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name); + ret = dict_set_uint32(dict, key1, 1); + if (ret) + return ret; - int_lock->lock_cbk = afr_sh_missing_entries_done; - afr_unlock (frame, this); + sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, 
priv->sh_domain); + ret = dict_set_uint32(dict, key2, 1); + if (ret) + return ret; - return 0; + return 0; } - -static int -sh_destroy_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int op_errno, - struct iatt *preop, struct iatt *postop) +int +afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies, + unsigned char *discover_on, dict_t *dict) { - afr_local_t *local = NULL; + loc_t loc = { + 0, + }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - loc_t *parent_loc = cookie; + local = frame->local; + priv = frame->this->private; - int call_count = 0; + xattr_req = dict_new(); + if (!xattr_req) + return -ENOMEM; + if (dict) + dict_copy(dict, xattr_req); - local = frame->local; + if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) { + dict_unref(xattr_req); + return -ENOMEM; + } - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "setattr on %s failed: %s", - local->loc.path, strerror (op_errno)); - } + if (afr_set_multi_dom_lock_count_request(frame->this, xattr_req)) { + dict_unref(xattr_req); + return -1; + } - if (parent_loc) { - loc_wipe (parent_loc); - GF_FREE (parent_loc); - } + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, gfid); - call_count = afr_frame_return (frame); + AFR_ONLIST(discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); - if (call_count == 0) { - STACK_DESTROY (frame->root); - } + afr_replies_copy(replies, local->replies, priv->child_count); - return 0; -} + loc_wipe(&loc); + dict_unref(xattr_req); + return 0; +} -static int -sh_missing_entries_newentry_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) +int +afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid, + struct afr_reply *replies) { - afr_local_t *local = NULL; - 
afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - call_frame_t *setattr_frame = NULL; - int call_count = 0; - int child_index = 0; + afr_local_t *local = NULL; + dict_t *dict = NULL; - loc_t *parent_loc = NULL; + local = frame->local; - struct iatt stbuf; - int32_t valid; + if (local->xattr_req) + dict = local->xattr_req; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies, + local->child_up, dict); +} - child_index = (long) cookie; +unsigned int +afr_success_count(struct afr_reply *replies, unsigned int count) +{ + int i = 0; + unsigned int success = 0; - stbuf.ia_atime = sh->buf[sh->source].ia_atime; - stbuf.ia_atime_nsec = sh->buf[sh->source].ia_atime_nsec; - stbuf.ia_mtime = sh->buf[sh->source].ia_mtime; - stbuf.ia_mtime_nsec = sh->buf[sh->source].ia_mtime_nsec; + for (i = 0; i < count; i++) + if (replies[i].valid && replies[i].op_ret == 0) + success++; + return success; +} - stbuf.ia_uid = sh->buf[sh->source].ia_uid; - stbuf.ia_gid = sh->buf[sh->source].ia_gid; +int +afr_selfheal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + int i = 0; - valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + local = frame->local; + i = (long)cookie; - if (op_ret == 0) { - setattr_frame = copy_frame (frame); + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; - setattr_frame->local = GF_CALLOC (1, sizeof (afr_local_t), - gf_afr_mt_afr_local_t); + syncbarrier_wake(&local->barrier); - ((afr_local_t *)setattr_frame->local)->call_count = 2; + return 0; +} - gf_log (this->name, GF_LOG_TRACE, - "setattr (%s) on subvolume %s", - local->loc.path, priv->children[child_index]->name); +int +afr_locked_fill(call_frame_t *frame, xlator_t *this, unsigned char *locked_on) +{ + int i = 0; + afr_private_t *priv = 
NULL; + afr_local_t *local = NULL; + int count = 0; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + locked_on[i] = 1; + count++; + } else { + locked_on[i] = 0; + } + } - STACK_WIND_COOKIE (setattr_frame, sh_destroy_cbk, - (void *) (long) 0, - priv->children[child_index], - priv->children[child_index]->fops->setattr, - &local->loc, &stbuf, valid); + return count; +} - valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - parent_loc = GF_CALLOC (1, sizeof (*parent_loc), - gf_afr_mt_loc_t); - afr_build_parent_loc (parent_loc, &local->loc); +int +afr_selfheal_tryinodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) +{ + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; - STACK_WIND_COOKIE (setattr_frame, sh_destroy_cbk, - (void *) (long) parent_loc, - priv->children[child_index], - priv->children[child_index]->fops->setattr, - parent_loc, &sh->parentbuf, valid); - } + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - call_count = afr_frame_return (frame); + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - if (call_count == 0) { - sh_missing_entries_finish (frame, this); - } + AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock, + NULL); - return 0; -} + loc_wipe(&loc); + return afr_locked_fill(frame, this, locked_on); +} -static int -sh_missing_entries_mknod (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int ret = 0; - int enoent_count = 0; - int call_count = 0; - mode_t st_mode = 0; - dev_t ia_dev = 0; - dict_t *dict = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) - if (sh->child_errno[i] == ENOENT) - enoent_count++; - - call_count = 
enoent_count; - local->call_count = call_count; - - st_mode = st_mode_from_ia (sh->buf[sh->source].ia_prot, - sh->buf[sh->source].ia_type); - ia_dev = sh->buf[sh->source].ia_dev; - - gf_log (this->name, GF_LOG_TRACE, - "mknod %s mode 0%o on %d subvolumes", - local->loc.path, st_mode, enoent_count); - - dict = dict_new (); - if (!dict) - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - - ret = afr_set_dict_gfid (dict, sh->buf[sh->source].ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, "gfid set failed"); +int +afr_selfheal_inodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) +{ + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; + + AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock, + NULL); + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_locked_fill(frame, this, locked_on); + afr_selfheal_uninodelk(frame, this, inode, dom, off, size, + locked_on); + + AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW, + &flock, NULL); + break; + } + } - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] == ENOENT) { - STACK_WIND_COOKIE (frame, - sh_missing_entries_newentry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mknod, - &local->loc, st_mode, ia_dev, dict); - if (!--call_count) - break; - } - } + loc_wipe(&loc); - if (dict) - dict_unref (dict); + return afr_locked_fill(frame, this, locked_on); +} - return 0; +static void +afr_get_lock_and_eagain_counts(afr_private_t *priv, struct afr_reply *replies, + int *lock_count, int *eagain_count) +{ + int 
i = 0; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + if (replies[i].op_ret == 0) { + (*lock_count)++; + } else if (replies[i].op_ret == -1 && replies[i].op_errno == EAGAIN) { + (*eagain_count)++; + } + } } +/*Do blocking locks if number of locks acquired is majority and there were some + * EAGAINs. Useful for odd-way replication*/ +int +afr_selfheal_tie_breaker_inodelk(call_frame_t *frame, xlator_t *this, + inode_t *inode, char *dom, off_t off, + size_t size, unsigned char *locked_on) +{ + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int lock_count = 0; + int eagain_count = 0; -static int -sh_missing_entries_mkdir (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - dict_t *dict = NULL; - int i = 0; - int ret = 0; - int enoent_count = 0; - int call_count = 0; - mode_t st_mode = 0; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) - if (sh->child_errno[i] == ENOENT) - enoent_count++; - - call_count = enoent_count; - local->call_count = call_count; - - st_mode = st_mode_from_ia (sh->buf[sh->source].ia_prot, - sh->buf[sh->source].ia_type); - - dict = dict_new (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - sh_missing_entries_finish (frame, this); - return 0; - } + priv = this->private; + local = frame->local; - ret = afr_set_dict_gfid (dict, sh->buf[sh->source].ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "inode gfid set failed"); + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - gf_log (this->name, GF_LOG_TRACE, - "mkdir %s mode 0%o on %d subvolumes", - local->loc.path, st_mode, enoent_count); + AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, 
&flock, + NULL); - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] == ENOENT) { - if (!strcmp (local->loc.path, "/")) { - /* We shouldn't try to create "/" */ + afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count, + &eagain_count); - sh_missing_entries_finish (frame, this); + if (lock_count > priv->child_count / 2 && eagain_count) { + afr_locked_fill(frame, this, locked_on); + afr_selfheal_uninodelk(frame, this, inode, dom, off, size, locked_on); - return 0; - } else { - STACK_WIND_COOKIE (frame, - sh_missing_entries_newentry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mkdir, - &local->loc, st_mode, dict); - if (!--call_count) - break; - } - } - } + AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW, + &flock, NULL); + } - if (dict) - dict_unref (dict); + loc_wipe(&loc); - return 0; + return afr_locked_fill(frame, this, locked_on); } +int +afr_selfheal_uninodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + const unsigned char *locked_on) +{ + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; -static int -sh_missing_entries_symlink (call_frame_t *frame, xlator_t *this, - const char *link, struct iatt *buf) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - dict_t *dict = NULL; - int i = 0; - int ret = 0; - int enoent_count = 0; - int call_count = 0; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) - if (sh->child_errno[i] == ENOENT) - enoent_count++; - - call_count = enoent_count; - local->call_count = call_count; - - dict = dict_new (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - sh_missing_entries_finish (frame, this); - return 0; - } + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - ret = afr_set_dict_gfid (dict, buf->ia_gfid); - if (ret) - gf_log (this->name, 
GF_LOG_DEBUG, - "dict gfid set failed"); + flock.l_type = F_UNLCK; + flock.l_start = off; + flock.l_len = size; - gf_log (this->name, GF_LOG_TRACE, - "symlink %s -> %s on %d subvolumes", - local->loc.path, link, enoent_count); + AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, + F_SETLK, &flock, NULL); - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] == ENOENT) { - STACK_WIND_COOKIE (frame, - sh_missing_entries_newentry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->symlink, - link, &local->loc, dict); - if (!--call_count) - break; - } - } + loc_wipe(&loc); - return 0; + return 0; } - -static int -sh_missing_entries_readlink_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *link, struct iatt *sbuf) +int +afr_selfheal_tryentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - if (op_ret > 0) - sh_missing_entries_symlink (frame, this, link, sbuf); - else - sh_missing_entries_finish (frame, this); + loc_t loc = { + 0, + }; - return 0; -} + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); -static int -sh_missing_entries_readlink (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + loc_wipe(&loc); + return afr_locked_fill(frame, this, locked_on); +} - local = frame->local; - sh = &local->self_heal; - priv = this->private; +int +afr_selfheal_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) +{ + loc_t loc = { + 0, + }; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); 
+ + AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_locked_fill(frame, this, locked_on); + afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on, + NULL); + + AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + break; + } + } - STACK_WIND (frame, sh_missing_entries_readlink_cbk, - priv->children[sh->source], - priv->children[sh->source]->fops->readlink, - &local->loc, 4096); + loc_wipe(&loc); - return 0; + return afr_locked_fill(frame, this, locked_on); } +int +afr_selfheal_tie_breaker_entrylk(call_frame_t *frame, xlator_t *this, + inode_t *inode, char *dom, const char *name, + unsigned char *locked_on) +{ + loc_t loc = { + 0, + }; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int lock_count = 0; + int eagain_count = 0; -static int -sh_missing_entries_create (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int type = 0; - int i = 0; - afr_private_t *priv = NULL; - int enoent_count = 0; - int govinda_gOvinda = 0; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; - - if (sh->child_errno[i]) { - if (sh->child_errno[i] == ENOENT) - enoent_count++; - } else { - if (type) { - if (type != sh->buf[i].ia_type) { - gf_log (this->name, GF_LOG_TRACE, - "file %s is govinda!", - local->loc.path); - - govinda_gOvinda = 1; - } - } else { - sh->source = i; - type = sh->buf[i].ia_type; - } - } - } - - if (govinda_gOvinda) { - gf_log (this->name, GF_LOG_ERROR, - "conflicting filetypes exist for path %s. 
returning.", - local->loc.path); - - local->govinda_gOvinda = 1; - sh_missing_entries_finish (frame, this); - return 0; - } - - if (!type) { - gf_log (this->name, GF_LOG_ERROR, - "no source found for %s. all nodes down?. returning.", - local->loc.path); - /* subvolumes down and/or file does not exist */ - sh_missing_entries_finish (frame, this); - return 0; - } - - if (enoent_count == 0) { - gf_log (this->name, GF_LOG_ERROR, - "no missing files - %s. proceeding to metadata check", - local->loc.path); - /* proceed to next step - metadata self-heal */ - sh_missing_entries_finish (frame, this); - return 0; - } - - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - sh_missing_entries_mknod (frame, this); - break; - case IA_IFLNK: - sh_missing_entries_readlink (frame, this); - break; - case IA_IFDIR: - sh_missing_entries_mkdir (frame, this); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "unknown file type: 0%o", type); - local->govinda_gOvinda = 1; - sh_missing_entries_finish (frame, this); - } - - return 0; -} + priv = this->private; + local = frame->local; + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); -static int -sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - int child_index = 0; - afr_local_t *local = NULL; - int call_count = 0; - afr_private_t *priv = NULL; + AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); + afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count, + &eagain_count); - local = frame->local; - priv = this->private; + if (lock_count > priv->child_count / 2 && eagain_count) { + afr_locked_fill(frame, this, locked_on); + afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on, NULL); - child_index = (long) cookie; + AFR_SEQ(frame, 
afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + } - LOCK (&frame->lock); - { - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "path %s on subvolume %s is of mode 0%o", - local->loc.path, - priv->children[child_index]->name, - buf->ia_type); + loc_wipe(&loc); - local->self_heal.buf[child_index] = *buf; - local->self_heal.parentbuf = *postparent; - } else { - gf_log (this->name, GF_LOG_TRACE, - "path %s on subvolume %s => -1 (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); + return afr_locked_fill(frame, this, locked_on); +} - local->self_heal.child_errno[child_index] = op_errno; - } +int +afr_selfheal_unentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on, + dict_t *xdata) +{ + loc_t loc = { + 0, + }; - } - UNLOCK (&frame->lock); + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - call_count = afr_frame_return (frame); + AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, + name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata); - if (call_count == 0) { - sh_missing_entries_create (frame, this); - } + loc_wipe(&loc); - return 0; + return 0; } +gf_boolean_t +afr_is_data_set(xlator_t *this, dict_t *xdata) +{ + return afr_is_pending_set(this, xdata, AFR_DATA_TRANSACTION); +} -static int -sh_missing_entries_lookup (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_is_metadata_set(xlator_t *this, dict_t *xdata) +{ + return afr_is_pending_set(this, xdata, AFR_METADATA_TRANSACTION); +} + +gf_boolean_t +afr_is_entry_set(xlator_t *this, dict_t *xdata) { - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - afr_private_t *priv = NULL; - dict_t *xattr_req = NULL; - int ret = -1; + return afr_is_pending_set(this, xdata, AFR_ENTRY_TRANSACTION); +} - local = frame->local; - priv = this->private; +/* + * This function inspects the looked up replies (in an unlocked manner) + * and 
decides whether a locked verification and possible healing is + * required or not. It updates the three booleans for each type + * of healing. If the boolean flag gets set to FALSE, then we are sure + * no healing is required. If the boolean flag gets set to TRUE then + * we have to proceed with locked reinspection. + */ - call_count = afr_up_children_count (priv->child_count, - local->child_up); +int +afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, + inode_t **link_inode, gf_boolean_t *data_selfheal, + gf_boolean_t *metadata_selfheal, + gf_boolean_t *entry_selfheal, + struct afr_reply *replies_dst) +{ + afr_private_t *priv = NULL; + inode_t *inode = NULL; + int i = 0; + int valid_cnt = 0; + struct iatt first = { + 0, + }; + int first_idx = 0; + struct afr_reply *replies = NULL; + int ret = -1; + + priv = this->private; + + inode = afr_inode_find(this, gfid); + if (!inode) + goto out; + + replies = alloca0(sizeof(*replies) * priv->child_count); + + ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies); + if (ret) + goto out; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + if (replies[i].op_ret == -1) + continue; + + /* The data segment of the changelog can be non-zero to indicate + * the directory needs a full heal. So the check below ensures + * it's not a directory before setting the data_selfheal boolean. 
+ */ + if (data_selfheal && !IA_ISDIR(replies[i].poststat.ia_type) && + afr_is_data_set(this, replies[i].xdata)) + *data_selfheal = _gf_true; - local->call_count = call_count; + if (metadata_selfheal && afr_is_metadata_set(this, replies[i].xdata)) + *metadata_selfheal = _gf_true; - xattr_req = dict_new(); + if (entry_selfheal && afr_is_entry_set(this, replies[i].xdata)) + *entry_selfheal = _gf_true; - if (xattr_req) { - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (xattr_req, - priv->pending_key[i], - 3 * sizeof(int32_t)); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - } + valid_cnt++; + if (valid_cnt == 1) { + first = replies[i].poststat; + first_idx = i; + continue; } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on subvolume %s", - local->loc.path, priv->children[i]->name); + if (!IA_EQUAL(first, replies[i].poststat, type)) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "TYPE mismatch %d vs %d on %s for gfid:%s", + (int)first.ia_type, (int)replies[i].poststat.ia_type, + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;" + "type=file;gfid=%s;" + "ia_type-%d=%s;ia_type-%d=%s", + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(replies[i].poststat.ia_gfid), first_idx, + gf_inode_type_to_str(first.ia_type), i, + gf_inode_type_to_str(replies[i].poststat.ia_type)); + ret = -EIO; + goto out; + } - STACK_WIND_COOKIE (frame, - sh_missing_entries_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &local->loc, xattr_req); + if (!IA_EQUAL(first, replies[i].poststat, uid)) { + gf_msg_debug(this->name, 0, + "UID mismatch " + "%d vs %d on %s for gfid:%s", + (int)first.ia_uid, (int)replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); - if 
(!--call_count) - break; - } - } + if (metadata_selfheal) + *metadata_selfheal = _gf_true; + } - if (xattr_req) - dict_unref (xattr_req); + if (!IA_EQUAL(first, replies[i].poststat, gid)) { + gf_msg_debug(this->name, 0, + "GID mismatch " + "%d vs %d on %s for gfid:%s", + (int)first.ia_uid, (int)replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); - return 0; -} + if (metadata_selfheal) + *metadata_selfheal = _gf_true; + } + if (!IA_EQUAL(first, replies[i].poststat, prot)) { + gf_msg_debug(this->name, 0, + "MODE mismatch " + "%d vs %d on %s for gfid:%s", + (int)st_mode_from_ia(first.ia_prot, 0), + (int)st_mode_from_ia(replies[i].poststat.ia_prot, 0), + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + + if (metadata_selfheal) + *metadata_selfheal = _gf_true; + } + if (IA_ISREG(first.ia_type) && + !IA_EQUAL(first, replies[i].poststat, size)) { + gf_msg_debug(this->name, 0, + "SIZE mismatch " + "%lld vs %lld on %s for gfid:%s", + (long long)first.ia_size, + (long long)replies[i].poststat.ia_size, + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + + if (data_selfheal) + *data_selfheal = _gf_true; + } + } -int -afr_sh_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) + if (valid_cnt > 0 && link_inode) { + *link_inode = inode_link(inode, NULL, NULL, &first); + if (!*link_inode) { + ret = -EINVAL; + goto out; + } + } else if (valid_cnt < 2) { + ret = afr_check_stale_error(replies, priv); + goto out; + } + + ret = 0; +out: + if (replies && replies_dst) + afr_replies_copy(replies_dst, replies, priv->child_count); + if (inode) + inode_unref(inode); + if (replies) + afr_replies_wipe(replies, priv->child_count); + + return ret; +} + +inode_t * +afr_inode_find(xlator_t *this, uuid_t gfid) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + inode_table_t *table = NULL; + inode_t *inode = NULL; - local = frame->local; - int_lock = &local->internal_lock; + 
table = this->itable; + if (!table) + return NULL; - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks failed."); - afr_sh_missing_entries_done (frame, this); - } else { + inode = inode_find(table, gfid); + if (inode) + return inode; - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks done. Proceeding to FOP"); - sh_missing_entries_lookup (frame, this); - } + inode = inode_new(table); + if (!inode) + return NULL; - return 0; + gf_uuid_copy(inode->gfid, gfid); + + return inode; } -static int -afr_sh_entrylk (call_frame_t *frame, xlator_t *this) +call_frame_t * +afr_frame_create(xlator_t *this, int32_t *op_errno) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + pid_t pid = GF_CLIENT_PID_SELF_HEALD; - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; + frame = create_frame(this, this->ctx->pool); + if (!frame) { + if (op_errno) + *op_errno = ENOMEM; + return NULL; + } - int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_ENTRY_SELF_HEAL_LK; + local = AFR_FRAME_INIT(frame, (*op_errno)); + if (!local) { + STACK_DESTROY(frame->root); + return NULL; + } - afr_set_lock_number (frame, this); + syncopctx_setfspid(&pid); - int_lock->lk_basename = local->loc.name; - int_lock->lk_loc = &sh->parent_loc; - int_lock->lock_cbk = afr_sh_post_nonblocking_entrylk_cbk; + frame->root->pid = pid; - afr_nonblocking_entrylk (frame, this); + afr_set_lk_owner(frame, this, frame->root); - return 0; + return frame; } -static int -afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_newentry_mark(call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, struct afr_reply *replies, + unsigned char *sources, unsigned char *newentry) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = 
NULL; - afr_private_t *priv = NULL; + int ret = 0; + int i = 0; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int **changelog = NULL; - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - priv = this->private; + priv = this->private; - gf_log (this->name, GF_LOG_TRACE, - "attempting to recreate missing entries for path=%s", - local->loc.path); + gf_uuid_copy(inode->gfid, replies[source].poststat.ia_gfid); - afr_build_parent_loc (&sh->parent_loc, &local->loc); + xattr = dict_new(); + if (!xattr) + return -ENOMEM; - afr_sh_entrylk (frame, this); - return 0; -} + changelog = afr_mark_pending_changelog(priv, newentry, xattr, + replies[source].poststat.ia_type); -afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *lc = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *shc = NULL; + if (!changelog) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + ret |= afr_selfheal_post_op(frame, this, inode, i, xattr, NULL); + } +out: + if (changelog) + afr_matrix_cleanup(changelog, priv->child_count); + if (xattr) + dict_unref(xattr); + return ret; +} - priv = this->private; +int +afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid) +{ + int ret = -1; + int entry_ret = 1; + int metadata_ret = 1; + int data_ret = 1; + int or_ret = 0; + inode_t *inode = NULL; + fd_t *fd = NULL; + gf_boolean_t data_selfheal = _gf_false; + gf_boolean_t metadata_selfheal = _gf_false; + gf_boolean_t entry_selfheal = _gf_false; + afr_private_t *priv = NULL; + + priv = this->private; + + ret = afr_selfheal_unlocked_inspect(frame, this, gfid, &inode, + &data_selfheal, &metadata_selfheal, + &entry_selfheal, NULL); + if (ret) + goto out; + + if (!(data_selfheal || metadata_selfheal || entry_selfheal)) { + ret = 2; + goto out; + } + + if (inode->ia_type == IA_IFREG) { + ret = afr_selfheal_data_open(this, inode, &fd); + if (!fd) { + 
ret = -EIO; + goto out; + } + } - sh = &l->self_heal; + if (data_selfheal && priv->data_self_heal) + data_ret = afr_selfheal_data(frame, this, fd); - lc = GF_CALLOC (1, sizeof (afr_local_t), - gf_afr_mt_afr_local_t); + if (metadata_selfheal && priv->metadata_self_heal) + metadata_ret = afr_selfheal_metadata(frame, this, inode); - shc = &lc->self_heal; + if (entry_selfheal && priv->entry_self_heal) + entry_ret = afr_selfheal_entry(frame, this, inode); - shc->unwind = sh->unwind; - shc->need_data_self_heal = sh->need_data_self_heal; - shc->need_metadata_self_heal = sh->need_metadata_self_heal; - shc->need_entry_self_heal = sh->need_entry_self_heal; - shc->forced_merge = sh->forced_merge; - shc->healing_fd_opened = sh->healing_fd_opened; - shc->data_lock_held = sh->data_lock_held; - if (sh->healing_fd && !sh->healing_fd_opened) - shc->healing_fd = fd_ref (sh->healing_fd); - else - shc->healing_fd = sh->healing_fd; - shc->background = sh->background; - shc->type = sh->type; - - if (l->loc.path) - loc_copy (&lc->loc, &l->loc); - - lc->child_up = memdup (l->child_up, priv->child_count); - if (l->xattr_req) - lc->xattr_req = dict_ref (l->xattr_req); - - if (l->cont.lookup.inode) - lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode); - if (l->cont.lookup.xattr) - lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr); - if (l->internal_lock.inode_locked_nodes) - lc->internal_lock.inode_locked_nodes = - memdup (l->internal_lock.inode_locked_nodes, - priv->child_count); - else - lc->internal_lock.inode_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes), - priv->child_count, - gf_afr_mt_char); - if (l->internal_lock.entry_locked_nodes) - lc->internal_lock.entry_locked_nodes = - memdup (l->internal_lock.entry_locked_nodes, - priv->child_count); - else - lc->internal_lock.entry_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.entry_locked_nodes), - priv->child_count, - gf_afr_mt_char); - if (l->internal_lock.locked_nodes) - 
lc->internal_lock.locked_nodes = - memdup (l->internal_lock.locked_nodes, - priv->child_count); - else - lc->internal_lock.locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), - priv->child_count, - gf_afr_mt_char); + or_ret = (data_ret | metadata_ret | entry_ret); - lc->internal_lock.inodelk_lock_count = - l->internal_lock.inodelk_lock_count; - lc->internal_lock.entrylk_lock_count = - l->internal_lock.entrylk_lock_count; + if (data_ret == -EIO || metadata_ret == -EIO || entry_ret == -EIO) + ret = -EIO; + else if (data_ret == 1 && metadata_ret == 1 && entry_ret == 1) + ret = 1; + else if (or_ret < 0) + ret = or_ret; + else + ret = 0; - return lc; +out: + if (inode) + inode_unref(inode); + if (fd) + fd_unref(fd); + return ret; } +/* + * This is the entry point for healing a given GFID. The return values for this + * function are as follows: + * '0' if the self-heal is successful + * '1' if the afr-xattrs are non-zero (due to on-going IO) and no heal is needed + * '2' if the afr-xattrs are all-zero and no heal is needed + * $errno if the heal on the gfid failed. 
+ */ int -afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) +afr_selfheal(xlator_t *this, uuid_t gfid) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - char sh_type_str[256] = {0,}; + int ret = -1; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; - priv = this->private; - local = bgsh_frame->local; - sh = &local->self_heal; + frame = afr_frame_create(this, NULL); + if (!frame) + return ret; - if (local->govinda_gOvinda) { - afr_set_split_brain (this, local->cont.lookup.inode, - _gf_true); - } else { - afr_set_split_brain (this, local->cont.lookup.inode, - _gf_false); - } + local = frame->local; + local->xdata_req = dict_new(); - afr_self_heal_type_str_get(sh, sh_type_str, - sizeof(sh_type_str)); - gf_log (this->name, GF_LOG_NORMAL, - "background %s self-heal completed on %s", sh_type_str, - local->loc.path); - FRAME_SU_UNDO (bgsh_frame, afr_local_t); + ret = afr_selfheal_do(frame, this, gfid); - if (!sh->unwound) { - sh->unwind (sh->orig_frame, this); - } + if (frame) + AFR_STACK_DESTROY(frame); - if (sh->background) { - LOCK (&priv->lock); - { - priv->background_self_heals_started--; - } - UNLOCK (&priv->lock); - } + return ret; +} - AFR_STACK_DESTROY (bgsh_frame); +afr_local_t * +__afr_dequeue_heals(afr_private_t *priv) +{ + afr_local_t *local = NULL; + + if (list_empty(&priv->heal_waiting)) + goto none; + if ((priv->background_self_heal_count > 0) && + (priv->healers >= priv->background_self_heal_count)) + goto none; + + local = list_entry(priv->heal_waiting.next, afr_local_t, healer); + priv->heal_waiters--; + GF_ASSERT(priv->heal_waiters >= 0); + list_del_init(&local->healer); + list_add(&local->healer, &priv->healing); + priv->healers++; + return local; +none: + gf_msg_debug(THIS->name, 0, + "Nothing dequeued. 
" + "Num healers: %d, Num Waiters: %d", + priv->healers, priv->heal_waiters); + return NULL; +} + +int +afr_refresh_selfheal_wrap(void *opaque) +{ + call_frame_t *heal_frame = opaque; + afr_local_t *local = heal_frame->local; + int ret = 0; - return 0; + ret = afr_selfheal(heal_frame->this, local->refreshinode->gfid); + return ret; } int -afr_self_heal (call_frame_t *frame, xlator_t *this) +afr_refresh_heal_done(int ret, call_frame_t *frame, void *opaque) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; + call_frame_t *heal_frame = opaque; + xlator_t *this = heal_frame->this; + afr_private_t *priv = this->private; + afr_local_t *local = heal_frame->local; + + LOCK(&priv->lock); + { + list_del_init(&local->healer); + priv->healers--; + GF_ASSERT(priv->healers >= 0); + local = __afr_dequeue_heals(priv); + } + UNLOCK(&priv->lock); + + AFR_STACK_DESTROY(heal_frame); + + if (local) + afr_heal_synctask(this, local); + return 0; +} - call_frame_t *sh_frame = NULL; - afr_local_t *sh_local = NULL; +void +afr_heal_synctask(xlator_t *this, afr_local_t *local) +{ + int ret = 0; + call_frame_t *heal_frame = NULL; + + heal_frame = local->heal_frame; + ret = synctask_new(this->ctx->env, afr_refresh_selfheal_wrap, + afr_refresh_heal_done, heal_frame, heal_frame); + if (ret < 0) + /* Heal not launched. Will be queued when the next inode + * refresh happens and shd hasn't healed it yet. 
*/ + afr_refresh_heal_done(ret, heal_frame, heal_frame); +} +gf_boolean_t +afr_throttled_selfheal(call_frame_t *frame, xlator_t *this) +{ + gf_boolean_t can_heal = _gf_true; + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + + LOCK(&priv->lock); + { + if ((priv->background_self_heal_count > 0) && + (priv->heal_wait_qlen + priv->background_self_heal_count) > + (priv->heal_waiters + priv->healers)) { + list_add_tail(&local->healer, &priv->heal_waiting); + priv->heal_waiters++; + local = __afr_dequeue_heals(priv); + } else { + can_heal = _gf_false; + } + } + UNLOCK(&priv->lock); - local = frame->local; - priv = this->private; + if (can_heal) { + if (local) + afr_heal_synctask(this, local); + else + gf_msg_debug(this->name, 0, + "Max number of heals are " + "pending, background self-heal rejected."); + } - GF_ASSERT (local->loc.path); + return can_heal; +} - afr_set_lk_owner (frame, this); +int +afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources, + afr_transaction_type type) +{ + int source = -1; + int i = 0; - if (local->self_heal.background) { - LOCK (&priv->lock); - { - if (priv->background_self_heals_started - > priv->background_self_heal_count) { + /* Give preference to local child to save on bandwidth */ + for (i = 0; i < priv->child_count; i++) { + if (priv->local[i] && sources[i]) { + if ((type == AFR_DATA_TRANSACTION) && AFR_IS_ARBITER_BRICK(priv, i)) + continue; - local->self_heal.background = _gf_false; + source = i; + goto out; + } + } - } else { - priv->background_self_heals_started++; - } - } - UNLOCK (&priv->lock); + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + goto out; } + } +out: + return source; +} - gf_log (this->name, GF_LOG_TRACE, - "performing self heal on %s (metadata=%d data=%d entry=%d)", - local->loc.path, - local->self_heal.need_metadata_self_heal, - local->self_heal.need_data_self_heal, - local->self_heal.need_entry_self_heal); - - sh_frame = copy_frame 
(frame); - sh_local = afr_local_copy (local, this); - sh_frame->local = sh_local; - sh = &sh_local->self_heal; - - sh->orig_frame = frame; - - sh->completion_cbk = afr_self_heal_completion_cbk; - - - sh->buf = GF_CALLOC (priv->child_count, sizeof (struct iatt), - gf_afr_mt_iatt); - sh->child_errno = GF_CALLOC (priv->child_count, sizeof (int), - gf_afr_mt_int); - sh->success = GF_CALLOC (priv->child_count, sizeof (int), - gf_afr_mt_int); - sh->xattr = GF_CALLOC (priv->child_count, sizeof (dict_t *), - gf_afr_mt_dict_t); - sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count, - gf_afr_mt_int); - sh->locked_nodes = GF_CALLOC (sizeof (*sh->locked_nodes), - priv->child_count, - gf_afr_mt_int); - - sh->pending_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); - - for (i = 0; i < priv->child_count; i++) { - sh->pending_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); - } - - sh->delta_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); - for (i = 0; i < priv->child_count; i++) { - sh->delta_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); - } - - FRAME_SU_DO (sh_frame, afr_local_t); - if (local->success_count && local->enoent_count) { - afr_self_heal_missing_entries (sh_frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); - - afr_sh_missing_entries_done (sh_frame, this); - } - - return 0; +static int +afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret == 0) { + local->op_ret = 0; + local->replies[i].poststat = *buf; + 
local->replies[i].preparent = *preparent; + local->replies[i].postparent = *postparent; + } + if (xdata) { + local->replies[i].xdata = dict_ref(xdata); + } + + syncbarrier_wake(&local->barrier); + return 0; } -void -afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, - size_t size) +int +afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode) { - GF_ASSERT (str && (size > 0)); + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = this->private; + unsigned char *mkdir_on = alloca0(priv->child_count); + unsigned char *lookup_on = alloca0(priv->child_count); + loc_t loc = {0}; + int32_t op_errno = 0; + int32_t child_op_errno = 0; + struct iatt iatt = {0}; + dict_t *xdata = NULL; + uuid_t anon_inode_gfid = {0}; + int mkdir_count = 0; + int i = 0; + + /*Try to mkdir everywhere and return success if the dir exists on 'child' + */ + + if (!priv->use_anon_inode) { + op_errno = EINVAL; + goto out; + } + + frame = afr_frame_create(this, &op_errno); + if (op_errno) { + goto out; + } + local = frame->local; + if (!local->child_up[child]) { + /*Other bricks may need mkdir so don't error out yet*/ + child_op_errno = ENOTCONN; + } + gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid); + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + + if (priv->anon_inode[i]) { + mkdir_on[i] = 0; + } else { + mkdir_on[i] = 1; + mkdir_count++; + } + } - if (self_heal_p->need_metadata_self_heal) { - snprintf(str, size, " meta-data"); + if (mkdir_count == 0) { + *linked_inode = inode_find(this->itable, anon_inode_gfid); + if (*linked_inode) { + op_errno = 0; + goto out; + } + } + + loc.parent = inode_ref(this->itable->root); + loc.name = priv->anon_inode_name; + loc.inode = inode_new(this->itable); + if (!loc.inode) { + op_errno = ENOMEM; + goto out; + } + + xdata = dict_new(); + if (!xdata) { + op_errno = ENOMEM; + goto out; + } + + op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, 
_gf_true); + if (op_errno) { + goto out; + } + + if (mkdir_count == 0) { + memcpy(lookup_on, local->child_up, priv->child_count); + goto lookup; + } + + AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0, + xdata); + + for (i = 0; i < priv->child_count; i++) { + if (!mkdir_on[i]) { + continue; } - if (self_heal_p->need_data_self_heal) { - snprintf(str + strlen(str), size - strlen(str), - " data"); + if (local->replies[i].op_ret == 0) { + priv->anon_inode[i] = 1; + iatt = local->replies[i].poststat; + } else if (local->replies[i].op_ret < 0 && + local->replies[i].op_errno == EEXIST) { + lookup_on[i] = 1; + } else if (i == child) { + child_op_errno = local->replies[i].op_errno; + } + } + + if (AFR_COUNT(lookup_on, priv->child_count) == 0) { + goto link; + } + +lookup: + AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xdata); + for (i = 0; i < priv->child_count; i++) { + if (!lookup_on[i]) { + continue; } - if (self_heal_p->need_entry_self_heal) { - snprintf(str + strlen(str), size - strlen(str), - " entry"); + if (local->replies[i].op_ret == 0) { + if (gf_uuid_compare(anon_inode_gfid, + local->replies[i].poststat.ia_gfid) == 0) { + priv->anon_inode[i] = 1; + iatt = local->replies[i].poststat; + } else { + if (i == child) + child_op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA, + "%s has gfid: %s", priv->anon_inode_name, + uuid_utoa(local->replies[i].poststat.ia_gfid)); + } + } else if (i == child) { + child_op_errno = local->replies[i].op_errno; + } + } +link: + if (!gf_uuid_is_null(iatt.ia_gfid)) { + *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt); + if (*linked_inode) { + op_errno = 0; + inode_lookup(*linked_inode); + } else { + op_errno = ENOMEM; } + goto out; + } + +out: + if (xdata) + dict_unref(xdata); + loc_wipe(&loc); + /*child_op_errno takes precedence*/ + if (child_op_errno == 0) { + child_op_errno = op_errno; + } + + if (child_op_errno && *linked_inode) { + 
inode_unref(*linked_inode); + *linked_inode = NULL; + } + if (frame) + AFR_STACK_DESTROY(frame); + return -child_op_errno; } diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h deleted file mode 100644 index 6431feaff35..00000000000 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - -#ifndef __AFR_SELF_HEAL_COMMON_H__ -#define __AFR_SELF_HEAL_COMMON_H__ - -#define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512)) - -typedef enum { - AFR_SELF_HEAL_ENTRY, - AFR_SELF_HEAL_METADATA, - AFR_SELF_HEAL_DATA, -} afr_self_heal_type; - -int -afr_sh_select_source (int sources[], int child_count); - -int -afr_sh_sink_count (int sources[], int child_count); - -int -afr_sh_source_count (int sources[], int child_count); - -int -afr_sh_supress_errenous_children (int sources[], int child_errno[], - int child_count); - -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); - -void -afr_sh_build_pending_matrix (afr_private_t *priv, - int32_t *pending_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type); - -void -afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], int success[], - int child_count, afr_transaction_type type); - -int -afr_sh_mark_sources (afr_self_heal_t *sh, int child_count, - afr_self_heal_type type); - -int -afr_sh_delta_to_xattr (afr_private_t *priv, - int32_t *delta_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type); - -int -afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count); - -void -afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, - size_t size); - -#endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 0fd8dae69f9..37bcc2b3f9e 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -1,1113 +1,891 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ -#include "glusterfs.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "afr-self-heal-algorithm.h" - +#include <glusterfs/byte-order.h> +#include "protocol-common.h" +#include "afr-messages.h" +#include <glusterfs/events.h> -int -afr_sh_data_done (call_frame_t *frame, xlator_t *this) +#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size)) +static int +__checksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, uint32_t weak, uint8_t *strong, dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - /* - TODO: cleanup sh->* - */ - - if (sh->healing_fd && !sh->healing_fd_opened) { - /* unref only if we created the fd ourselves */ - - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; + afr_local_t *local = NULL; + struct afr_reply *replies = NULL; + int i = (long)cookie; + + local = frame->local; + replies = local->replies; + + replies[i].valid = 1; + replies[i].op_ret = op_ret; + replies[i].op_errno = op_errno; + if (xdata) { + replies[i].buf_has_zeroes = dict_get_str_boolean( + xdata, "buf-has-zeroes", _gf_false); + replies[i].fips_mode_rchecksum = dict_get_str_boolean( + xdata, "fips-mode-rchecksum", _gf_false); + } + if (strong) { + if (replies[i].fips_mode_rchecksum) { + memcpy(local->replies[i].checksum, strong, SHA256_DIGEST_LENGTH); + } else { + memcpy(local->replies[i].checksum, strong, MD5_DIGEST_LENGTH); } + } -/* for (i = 0; i < priv->child_count; i++) */ -/* sh->locked_nodes[i] = 0; */ - - gf_log (this->name, GF_LOG_TRACE, - "self heal of %s 
completed", - local->loc.path); - - sh->completion_cbk (frame, this); - - return 0; + syncbarrier_wake(&local->barrier); + return 0; } - -int -afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +static gf_boolean_t +__afr_can_skip_data_block_heal(call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + off_t offset, size_t size, struct iatt *poststat) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "flush or setattr failed on %s on subvolume %s: %s", - local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + unsigned char *wind_subvols = NULL; + gf_boolean_t checksum_match = _gf_true; + struct afr_reply *replies = NULL; + dict_t *xdata = NULL; + int i = 0; + + priv = this->private; + local = frame->local; + replies = local->replies; + + xdata = dict_new(); + if (!xdata) + goto out; + if (dict_set_int32_sizen(xdata, "check-zero-filled", 1)) { + dict_unref(xdata); + goto out; + } + + wind_subvols = alloca0(priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (i == source || healed_sinks[i]) + wind_subvols[i] = 1; + } + + AFR_ONLIST(wind_subvols, frame, __checksum_cbk, rchecksum, fd, offset, size, + xdata); + if (xdata) + dict_unref(xdata); + + if (!replies[source].valid || replies[source].op_ret != 0) + return _gf_false; + + for (i = 0; i < priv->child_count; i++) { + if (i == source) + continue; + if (replies[i].valid) { + if (memcmp(replies[source].checksum, replies[i].checksum, + replies[source].fips_mode_rchecksum + ? 
SHA256_DIGEST_LENGTH + : MD5_DIGEST_LENGTH)) { + checksum_match = _gf_false; + break; + } + } + } - if (call_count == 0) { - afr_sh_data_done (frame, this); - } + if (checksum_match) { + if (HAS_HOLES(poststat)) + return _gf_true; - return 0; + /* For non-sparse files, we might be better off writing the + * zeroes to sinks to avoid mismatch of disk-usage in bricks. */ + if (local->replies[source].buf_has_zeroes) + return _gf_false; + else + return _gf_true; + } +out: + return _gf_false; } - -int -afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *statpre, struct iatt *statpost) +static gf_boolean_t +__afr_is_sink_zero_filled(xlator_t *this, fd_t *fd, size_t size, off_t offset, + int sink) { - afr_sh_data_flush_cbk (frame, cookie, this, op_ret, op_errno); - - return 0; + afr_private_t *priv = NULL; + struct iobref *iobref = NULL; + struct iovec *iovec = NULL; + int count = 0; + int ret = 0; + gf_boolean_t zero_filled = _gf_false; + + priv = this->private; + ret = syncop_readv(priv->children[sink], fd, size, offset, 0, &iovec, + &count, &iobref, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = iov_0filled(iovec, count); + if (!ret) + zero_filled = _gf_true; +out: + if (iovec) + GF_FREE(iovec); + if (iobref) + iobref_unref(iobref); + return zero_filled; } - -int -afr_sh_data_close (call_frame_t *frame, xlator_t *this) +static int +__afr_selfheal_data_read_write(call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + off_t offset, size_t size, + struct afr_reply *replies, int type) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - - int i = 0; - int call_count = 0; - int source = 0; - int32_t valid = 0; - - struct iatt stbuf = {0,}; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - valid |= (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); - - stbuf.ia_atime = 
sh->buf[source].ia_atime; - stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; - stbuf.ia_mtime = sh->buf[source].ia_mtime; - stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; - - if (sh->healing_fd_opened) { - /* not our job to close the fd */ + struct iovec *iovec = NULL; + int count = 0; + struct iobref *iobref = NULL; + int ret = 0; + int i = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + ret = syncop_readv(priv->children[source], fd, size, offset, 0, &iovec, + &count, &iobref, NULL, NULL, NULL); + if (ret <= 0) + return ret; - afr_sh_data_done (frame, this); - return 0; + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + + /* + * TODO: Use fiemap() and discard() to heal holes + * in the future. + * + * For now, + * + * - if the source had any holes at all, + * AND + * - if we are writing past the original file size + * of the sink + * AND + * - is NOT the last block of the source file. if + * the block contains EOF, it has to be written + * in order to set the file size even if the + * last block is 0-filled. + * AND + * - if the read buffer is filled with only 0's + * + * then, skip writing to this source. We don't depend + * on the write to happen to update the size as we + * have performed an ftruncate() upfront anyways. 
+ */ +#define is_last_block(o, b, s) ((s >= o) && (s <= (o + b))) + if (HAS_HOLES((&replies[source].poststat)) && + offset >= replies[i].poststat.ia_size && + !is_last_block(offset, size, replies[source].poststat.ia_size) && + (iov_0filled(iovec, count) == 0)) + continue; + + /* Avoid filling up sparse regions of the sink with 0-filled + * writes.*/ + if (type == AFR_SELFHEAL_DATA_FULL && + HAS_HOLES((&replies[source].poststat)) && + ((offset + size) <= replies[i].poststat.ia_size) && + (iov_0filled(iovec, count) == 0) && + __afr_is_sink_zero_filled(this, fd, size, offset, i)) { + continue; } - if (!sh->healing_fd) { - afr_sh_data_done (frame, this); - return 0; - } - - call_count = (sh->active_sinks + 1) * 2; - local->call_count = call_count; - - /* closed source */ - gf_log (this->name, GF_LOG_TRACE, - "closing fd of %s on %s", - local->loc.path, priv->children[sh->source]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, - (void *) (long) sh->source, - priv->children[sh->source], - priv->children[sh->source]->fops->flush, - sh->healing_fd); - call_count--; - - STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, - (void *) (long) sh->source, - priv->children[sh->source], - priv->children[sh->source]->fops->setattr, - &local->loc, &stbuf, valid); - - call_count--; - - if (call_count == 0) - return 0; - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "closing fd of %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - sh->healing_fd); - - call_count--; - - STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, &stbuf, valid); - - if (!--call_count) - break; - } - - return 0; -} - + ret = syncop_writev(priv->children[i], fd, iovec, count, offset, 
iobref, + 0, NULL, NULL, NULL, NULL); + if (ret != iov_length(iovec, count)) { + /* write() failed on this sink. unset the corresponding + member in sinks[] (which is healed_sinks[] in the + caller) so that this server does NOT get considered + as successfully healed. + */ + healed_sinks[i] = 0; + } + } + if (iovec) + GF_FREE(iovec); + if (iobref) + iobref_unref(iobref); -int -afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t * local = NULL; - int call_count = 0; - int child_index = (long) cookie; - - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "locking inode of %s on child %d failed: %s", - local->loc.path, child_index, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "inode of %s on child %d locked", - local->loc.path, child_index); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_close (frame, this); - } - - return 0; + return ret; } - -int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this) +static gf_boolean_t +afr_source_sinks_locked(xlator_t *this, unsigned char *locked_on, int source, + unsigned char *healed_sinks) { - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_self_heal_t *sh = NULL; + afr_private_t *priv = this->private; + int i = 0; - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; + if (!locked_on[source]) + return _gf_false; - GF_ASSERT (!sh->data_lock_held); + for (i = 0; i < priv->child_count; i++) { + if (healed_sinks[i] && locked_on[i]) + return _gf_true; + } - int_lock->lock_cbk = afr_sh_data_close; - afr_unlock (frame, this); - - return 0; + return _gf_false; } - -int -afr_sh_data_finish (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_data_block(call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char 
*healed_sinks, off_t offset, + size_t size, int type, struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - gf_log (this->name, GF_LOG_TRACE, - "finishing data selfheal of %s", local->loc.path); + int ret = -1; + afr_private_t *priv = NULL; + unsigned char *data_lock = NULL; + + priv = this->private; + data_lock = alloca0(priv->child_count); + + ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, offset, size, + data_lock); + { + if (!afr_source_sinks_locked(this, data_lock, source, healed_sinks)) { + ret = -ENOTCONN; + goto unlock; + } - if (!sh->data_lock_held) - afr_sh_data_unlock (frame, this); - else - afr_sh_data_close (frame, this); + if (type == AFR_SELFHEAL_DATA_DIFF && + __afr_can_skip_data_block_heal(frame, this, fd, source, + healed_sinks, offset, size, + &replies[source].poststat)) { + ret = 0; + goto unlock; + } - return 0; + ret = __afr_selfheal_data_read_write( + frame, this, fd, source, healed_sinks, offset, size, replies, type); + } +unlock: + afr_selfheal_uninodelk(frame, this, fd->inode, this->name, offset, size, + data_lock); + return ret; } - -int -afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) +static int +afr_selfheal_data_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *healed_sinks) { - int call_count = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; - call_count = afr_frame_return (frame); + local = frame->local; + priv = this->private; - if (call_count == 0) - afr_sh_data_finish (frame, this); - - return 0; -} + if (!priv->ensure_durability) + return 0; + AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, fsync, fd, 0, NULL); -int -afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int 
call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success, - priv->child_count, AFR_DATA_TRANSACTION); - - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, - gf_afr_mt_dict_t); - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } - } - - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_DATA_TRANSACTION); - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - sh->healing_fd, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } - - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - GF_FREE (erase_xattr); - - return 0; + for (i = 0; i < priv->child_count; i++) + if (healed_sinks[i] && local->replies[i].op_ret != 0) + /* fsync() failed. Do NOT consider this server + as successfully healed. Mark it so. 
+ */ + healed_sinks[i] = 0; + return 0; } - -int -afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +static int +afr_data_self_heal_type_get(afr_private_t *priv, unsigned char *healed_sinks, + int source, struct afr_reply *replies) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int call_count = 0; - int child_index = 0; - - priv = this->private; - local = frame->local; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) - gf_log (this->name, GF_LOG_DEBUG, - "ftruncate of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - else - gf_log (this->name, GF_LOG_TRACE, - "ftruncate of %s on subvolume %s completed", - local->loc.path, - priv->children[child_index]->name); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_erase_pending (frame, this); - } - - return 0; + int type = AFR_SELFHEAL_DATA_FULL; + int i = 0; + + if (priv->data_self_heal_algorithm == AFR_SELFHEAL_DATA_DYNAMIC) { + type = AFR_SELFHEAL_DATA_FULL; + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i] && i != source) + continue; + if (replies[i].poststat.ia_size) { + type = AFR_SELFHEAL_DATA_DIFF; + break; + } + } + } else { + type = priv->data_self_heal_algorithm; + } + return type; } - -int -afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_data_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source, + unsigned char *healed_sinks, struct afr_reply *replies) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - int *sources = NULL; - int call_count = 0; - int i = 0; - - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sources = sh->sources; - call_count = sh->active_sinks; - - local->call_count = 
call_count; - - for (i = 0; i < priv->child_count; i++) { - if (sources[i] || !local->child_up[i]) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ftruncate, - sh->healing_fd, sh->file_size); - - if (!--call_count) - break; - } - - return 0; -} + afr_private_t *priv = NULL; + off_t off = 0; + size_t block = 0; + int type = AFR_SELFHEAL_DATA_FULL; + int ret = -1; + call_frame_t *iter_frame = NULL; + unsigned char arbiter_sink_status = 0; + + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, + "performing data selfheal on %s", uuid_utoa(fd->inode->gfid)); + + priv = this->private; + if (priv->arbiter_count) { + arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX]; + healed_sinks[ARBITER_BRICK_INDEX] = 0; + } + + block = 128 * 1024 * priv->data_self_heal_window_size; + + type = afr_data_self_heal_type_get(priv, healed_sinks, source, replies); + + iter_frame = afr_copy_frame(frame); + if (!iter_frame) { + ret = -ENOMEM; + goto out; + } + + for (off = 0; off < replies[source].poststat.ia_size; off += block) { + if (AFR_COUNT(healed_sinks, priv->child_count) == 0) { + ret = -ENOTCONN; + goto out; + } + ret = afr_selfheal_data_block(iter_frame, this, fd, source, + healed_sinks, off, block, type, replies); + if (ret < 0) + goto out; -static struct afr_sh_algorithm * -sh_algo_from_name (xlator_t *this, char *name) -{ - int i = 0; + AFR_STACK_RESET(iter_frame); + if (iter_frame->local == NULL) { + ret = -ENOTCONN; + goto out; + } + } - while (afr_self_heal_algorithms[i].name) { - if (!strcmp (name, afr_self_heal_algorithms[i].name)) { - return &afr_self_heal_algorithms[i]; - } + ret = afr_selfheal_data_fsync(frame, this, fd, healed_sinks); - i++; - } +out: + if (arbiter_sink_status) + healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status; - return NULL; + if (iter_frame) + AFR_STACK_DESTROY(iter_frame); + return ret; } - static int -sh_zero_byte_files_exist (afr_self_heal_t 
*sh, int child_count) +__afr_selfheal_truncate_sinks(call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *healed_sinks, uint64_t size) { - int i; - int ret = 0; - - for (i = 0; i < child_count; i++) { - if (sh->buf[i].ia_size == 0) { - ret = 1; - break; - } - } - - return ret; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + /* This will send truncate on the arbiter brick as well if it is marked as + * sink. If changelog is enabled on the volume it captures truncate as a + * data transactions on the arbiter brick. This will help geo-rep to + * properly sync the data from master to slave if arbiter is the ACTIVE + * brick during syncing and which had got some entries healed for data as + * part of self heal. + */ + AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, ftruncate, fd, size, + NULL); + + for (i = 0; i < priv->child_count; i++) + if (healed_sinks[i] && local->replies[i].op_ret == -1) + /* truncate() failed. Do NOT consider this server + as successfully healed. Mark it so. 
+ */ + healed_sinks[i] = 0; + + return 0; } - -struct afr_sh_algorithm * -afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_has_source_witnesses(xlator_t *this, unsigned char *sources, + uint64_t *witness) { - afr_private_t * priv = NULL; - struct afr_sh_algorithm * algo = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - algo = sh_algo_from_name (this, priv->data_self_heal_algorithm); - - if (algo == NULL) { - /* option not set, so fall back on heuristics */ - - if ((local->enoent_count != 0) - || sh_zero_byte_files_exist (sh, priv->child_count) - || (sh->file_size <= (priv->data_self_heal_window_size * this->ctx->page_size))) { - - /* - * If the file does not exist on one of the subvolumes, - * or a zero-byte file exists (created by entry self-heal) - * the entire content has to be copied anyway, so there - * is no benefit from using the "diff" algorithm. - * - * If the file size is about the same as page size, - * the entire file can be read and written with a few - * (pipelined) STACK_WINDs, which will be faster - * than "diff" which has to read checksums and then - * read and write. 
- */ - - algo = sh_algo_from_name (this, "full"); - - } else { - algo = sh_algo_from_name (this, "diff"); - } - } - - return algo; -} + int i = 0; + afr_private_t *priv = NULL; + priv = this->private; -int -afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int active_sinks = 0; - int source = 0; - int i = 0; - - struct afr_sh_algorithm *sh_algo = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; - } - } - sh->success[source] = 1; - - if (active_sinks == 0) { - gf_log (this->name, GF_LOG_TRACE, - "no active sinks for performing self-heal on file %s", - local->loc.path); - afr_sh_data_finish (frame, this); - return 0; - } - sh->active_sinks = active_sinks; - - gf_log (this->name, GF_LOG_DEBUG, - "self-healing file %s from subvolume %s to %d other", - local->loc.path, priv->children[source]->name, active_sinks); - - sh->algo_completion_cbk = afr_sh_data_trim_sinks; - sh->algo_abort_cbk = afr_sh_data_finish; - - sh_algo = afr_sh_data_pick_algo (frame, this); - - sh_algo->fn (frame, this); - - return 0; + for (i = 0; i < priv->child_count; i++) { + if (sources[i] && witness[i]) + return _gf_true; + } + return _gf_false; } - -int -afr_sh_data_fix (call_frame_t *frame, xlator_t *this) +static gf_boolean_t +afr_does_size_mismatch(xlator_t *this, unsigned char *sources, + struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_local_t * orig_local = NULL; + int i = 0; + afr_private_t *priv = NULL; + struct iatt *min = NULL; + struct iatt *max = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int source = 0; - int i = 0; + priv = this->private; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + for (i = 
0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; - afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr, - priv->child_count, AFR_DATA_TRANSACTION); + if (replies[i].op_ret < 0) + continue; - afr_sh_print_pending_matrix (sh->pending_matrix, this); + if (!sources[i]) + continue; - nsources = afr_sh_mark_sources (sh, priv->child_count, - AFR_SELF_HEAL_DATA); + if (AFR_IS_ARBITER_BRICK(priv, i) && (replies[i].poststat.ia_size == 0)) + continue; - afr_sh_supress_errenous_children (sh->sources, sh->child_errno, - priv->child_count); + if (!min) + min = &replies[i].poststat; - if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); + if (!max) + max = &replies[i].poststat; - afr_sh_data_finish (frame, this); - return 0; - } - - if ((nsources == -1) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { - - gf_log (this->name, GF_LOG_DEBUG, - "Picking favorite child %s as authentic source to resolve conflicting data of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); - - sh->sources[priv->favorite_child] = 1; - - nsources = afr_sh_source_count (sh->sources, - priv->child_count); - } - - if (nsources == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to self-heal contents of '%s' (possible split-brain). 
" - "Please delete the file from all but the preferred " - "subvolume.", local->loc.path); - - local->govinda_gOvinda = 1; - - afr_sh_data_finish (frame, this); - return 0; - } - - source = afr_sh_select_source (sh->sources, priv->child_count); - - if (source == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "No active sources found."); - - afr_sh_data_finish (frame, this); - return 0; - } - - sh->source = source; - sh->block_size = 65536; - sh->file_size = sh->buf[source].ia_size; - - if (FILE_HAS_HOLES (&sh->buf[source])) - sh->file_has_holes = 1; - - orig_local = sh->orig_frame->local; - orig_local->cont.lookup.buf.ia_size = sh->buf[source].ia_size; + if (min->ia_size > replies[i].poststat.ia_size) + min = &replies[i].poststat; - /* detect changes not visible through pending flags -- JIC */ - for (i = 0; i < priv->child_count; i++) { - if (i == source || sh->child_errno[i]) - continue; + if (max->ia_size < replies[i].poststat.ia_size) + max = &replies[i].poststat; + } - if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - } + if (min && max) { + if (min->ia_size != max->ia_size) + return _gf_true; + } - afr_set_read_child (this, local->loc.inode, sh->source); - - /* - quick-read might have read the file, so send xattr from - the source subvolume (http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=815) - */ - - dict_unref (orig_local->cont.lookup.xattr); - if (orig_local->cont.lookup.xattrs) - orig_local->cont.lookup.xattr = dict_ref (orig_local->cont.lookup.xattrs[sh->source]); - - if (sh->background) { - sh->unwind (sh->orig_frame, this); - sh->unwound = _gf_true; - } - - afr_sh_data_sync_prepare (frame, this); - - return 0; + return _gf_false; } - -int -afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr) +static void +afr_mark_biggest_witness_as_source(xlator_t *this, unsigned char *sources, + uint64_t *witness) { - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - int source = 0; - int i = 0; - - 
sh = &local->self_heal; - priv = this->private; - - sh->pending_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); - for (i = 0; i < priv->child_count; i++) { - sh->pending_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); - } - - sh->sources = GF_CALLOC (priv->child_count, sizeof (*sh->sources), - gf_afr_mt_int32_t); - - afr_sh_build_pending_matrix (priv, sh->pending_matrix, xattr, - priv->child_count, AFR_DATA_TRANSACTION); - - (void)afr_sh_mark_sources (sh, priv->child_count, AFR_SELF_HEAL_DATA); - - source = afr_sh_select_source (sh->sources, priv->child_count); - - return source; + int i = 0; + afr_private_t *priv = NULL; + uint64_t biggest_witness = 0; + + priv = this->private; + /* Find source with biggest witness count */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (biggest_witness < witness[i]) + biggest_witness = witness[i]; + } + + /* Mark files with less witness count as not source */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (witness[i] < biggest_witness) + sources[i] = 0; + } + + return; } - -int -afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf) +/* This is a tie breaker function. 
Only one source be assigned here */ +static void +afr_mark_newest_file_as_source(xlator_t *this, unsigned char *sources, + struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - gf_log (this->name, GF_LOG_TRACE, - "fstat of %s on %s succeeded", - local->loc.path, - priv->children[child_index]->name); - - sh->buf[child_index] = *buf; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_fix (frame, this); - } + int i = 0; + afr_private_t *priv = NULL; + int source = -1; + uint32_t max_ctime = 0; + + priv = this->private; + /* Find source with latest ctime */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + + if (max_ctime <= replies[i].poststat.ia_ctime) { + source = i; + max_ctime = replies[i].poststat.ia_ctime; + } + } - return 0; + /* Only mark one of the files as source to break ties */ + memset(sources, 0, sizeof(*sources) * priv->child_count); + sources[source] = 1; } - -int -afr_sh_data_fstat (call_frame_t *frame, xlator_t *this) +static int +__afr_selfheal_data_finalize_source( + call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + unsigned char *undid_pending, struct afr_reply *replies, uint64_t *witness) { - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = 0; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - call_count = afr_up_children_count (priv->child_count, - local->child_up); - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, 
afr_sh_data_fstat_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fstat, - sh->healing_fd); - - if (!--call_count) - break; - } - } + afr_private_t *priv = NULL; + int source = -1; + int sources_count = 0; + priv = this->private; + + sources_count = AFR_COUNT(sources, priv->child_count); + + if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) || + !sources_count) { + /* split brain */ + source = afr_mark_split_brain_source_sinks( + frame, this, inode, sources, sinks, healed_sinks, locked_on, + replies, AFR_DATA_TRANSACTION); + if (source < 0) { + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;type=data;" + "file=%s", + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(inode->gfid)); + return -EIO; + } - return 0; + _afr_fav_child_reset_sink_xattrs( + frame, this, inode, source, healed_sinks, undid_pending, + AFR_DATA_TRANSACTION, locked_on, replies); + goto out; + } + + /* No split brain at this point. If we were called from + * afr_heal_splitbrain_file(), abort.*/ + if (afr_dict_contains_heal_op(frame)) + return -EIO; + + /* If there are no witnesses/size-mismatches on sources we are done*/ + if (!afr_does_size_mismatch(this, sources, replies) && + !afr_has_source_witnesses(this, sources, witness)) + goto out; + + afr_mark_largest_file_as_source(this, sources, replies); + afr_mark_biggest_witness_as_source(this, sources, witness); + afr_mark_newest_file_as_source(this, sources, replies); + if (priv->arbiter_count) + /* Choose non-arbiter brick as source for empty files. */ + afr_mark_source_sinks_if_file_empty(this, sources, sinks, healed_sinks, + locked_on, replies, + AFR_DATA_TRANSACTION); + +out: + afr_mark_active_sinks(this, sources, locked_on, healed_sinks); + source = afr_choose_source_by_policy(priv, sources, AFR_DATA_TRANSACTION); + + return source; } - +/* + * __afr_selfheal_data_prepare: + * + * This function inspects the on-disk xattrs and determines which subvols + * are sources and sinks. 
+ * + * The return value is the index of the subvolume to be used as the source + * for self-healing, or -1 if no healing is necessary/split brain. + */ int -afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) +__afr_selfheal_data_prepare(call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + unsigned char *undid_pending, + struct afr_reply *replies, unsigned char *pflag) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + uint64_t *witness = NULL; - LOCK (&frame->lock); - { - if (op_ret != -1) { - gf_log (this->name, GF_LOG_TRACE, - "fxattrop of %s on %s succeeded", - local->loc.path, - priv->children[child_index]->name); + priv = this->private; - sh->xattr[child_index] = dict_ref (xattr); - } - } - UNLOCK (&frame->lock); + ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); - call_count = afr_frame_return (frame); + if (ret) + return ret; - if (call_count == 0) { - afr_sh_data_fstat (frame, this); - } + witness = alloca0(priv->child_count * sizeof(*witness)); + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_DATA_TRANSACTION, locked_on, sources, + sinks, witness, pflag); + if (ret) + return ret; - return 0; + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. 
+ */ + AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count); + + source = __afr_selfheal_data_finalize_source( + frame, this, inode, sources, sinks, healed_sinks, locked_on, + undid_pending, replies, witness); + if (source < 0) + return -EIO; + + return source; } - -int -afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this) +static int +__afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on) { - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - dict_t *xattr_req = NULL; - - int32_t zero_pending[3] = {0, 0, 0}; - - int call_count = 0; - int i = 0; - int ret = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - call_count = afr_up_children_count (priv->child_count, - local->child_up); - - local->call_count = call_count; - - xattr_req = dict_new(); - if (xattr_req) { - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_static_bin (xattr_req, priv->pending_key[i], - zero_pending, 3 * sizeof(int32_t)); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value"); - } + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *undid_pending = NULL; + struct afr_reply *locked_replies = NULL; + int source = -1; + gf_boolean_t did_sh = _gf_true; + gf_boolean_t is_arbiter_the_only_sink = _gf_false; + gf_boolean_t empty_file = _gf_false; + + priv = this->private; + + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + healed_sinks = alloca0(priv->child_count); + data_lock = alloca0(priv->child_count); + undid_pending = alloca0(priv->child_count); + + locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count); + + ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0, + data_lock); + { + if (ret < priv->child_count) { + 
gf_msg_debug(this->name, 0, + "%s: Skipping " + "self-heal as only %d number " + "of subvolumes " + "could be locked", + uuid_utoa(fd->inode->gfid), ret); + ret = -ENOTCONN; + goto unlock; } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - sh->healing_fd, GF_XATTROP_ADD_ARRAY, - xattr_req); - - if (!--call_count) - break; - } - } - - if (xattr_req) - dict_unref (xattr_req); + ret = __afr_selfheal_data_prepare(frame, this, fd->inode, data_lock, + sources, sinks, healed_sinks, + undid_pending, locked_replies, NULL); + if (ret < 0) + goto unlock; - return 0; -} - - -int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this); - -int -afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + if (AFR_COUNT(healed_sinks, priv->child_count) == 0) { + did_sh = _gf_false; + goto unlock; + } - local = frame->local; - int_lock = &local->internal_lock; + source = ret; - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Non Blocking inodelks failed."); - afr_sh_data_done (frame, this); - } else { + if (AFR_IS_ARBITER_BRICK(priv, source)) { + empty_file = afr_is_file_empty_on_all_children(priv, + locked_replies); + if (empty_file) + goto restore_time; - gf_log (this->name, GF_LOG_DEBUG, - "Non Blocking inodelks done. 
Proceeding to FOP"); - afr_sh_data_fxattrop (frame, this); + did_sh = _gf_false; + goto unlock; } - return 0; + ret = __afr_selfheal_truncate_sinks( + frame, this, fd, healed_sinks, + locked_replies[source].poststat.ia_size); + if (ret < 0) + goto unlock; + + if (priv->arbiter_count && + AFR_COUNT(healed_sinks, priv->child_count) == 1 && + healed_sinks[ARBITER_BRICK_INDEX]) { + is_arbiter_the_only_sink = _gf_true; + goto restore_time; + } + ret = 0; + } +unlock: + afr_selfheal_uninodelk(frame, this, fd->inode, this->name, 0, 0, data_lock); + if (ret < 0) + goto out; + + if (!did_sh) + goto out; + + ret = afr_selfheal_data_do(frame, this, fd, source, healed_sinks, + locked_replies); + if (ret) + goto out; +restore_time: + afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks, + locked_replies); + + if (!is_arbiter_the_only_sink && !empty_file) { + ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0, + data_lock); + if (ret < priv->child_count) { + ret = -ENOTCONN; + did_sh = _gf_false; + goto skip_undo_pending; + } + } + ret = afr_selfheal_undo_pending( + frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending, + AFR_DATA_TRANSACTION, locked_replies, data_lock); +skip_undo_pending: + afr_selfheal_uninodelk(frame, this, fd->inode, this->name, 0, 0, data_lock); +out: + + if (did_sh) + afr_log_selfheal(fd->inode->gfid, this, ret, "data", source, sources, + healed_sinks); + else + ret = 1; + + if (locked_replies) + afr_replies_wipe(locked_replies, priv->child_count); + + return ret; } int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this) +afr_selfheal_data_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; + afr_local_t *local = NULL; + int i = (long)cookie; - 
int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_DATA_SELF_HEAL_LK; + local = frame->local; - afr_set_lock_number (frame, this); + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; - int_lock->lk_flock.l_start = 0; - int_lock->lk_flock.l_len = 0; - int_lock->lk_flock.l_type = F_WRLCK; - int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk; + syncbarrier_wake(&local->barrier); - afr_nonblocking_inodelk (frame, this); - - - return 0; + return 0; } - int -afr_sh_data_lock (call_frame_t *frame, xlator_t *this) +afr_selfheal_data_open(xlator_t *this, inode_t *inode, fd_t **fd) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t * sh = NULL; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (sh->data_lock_held) { - /* caller has held the lock already, - so skip locking */ - - afr_sh_data_fxattrop (frame, this); - return 0; + int ret = 0; + fd_t *fd_tmp = NULL; + loc_t loc = { + 0, + }; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + fd_tmp = fd_create(inode, 0); + if (!fd_tmp) + return -ENOMEM; + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + frame = afr_frame_create(this, &ret); + if (!frame) { + ret = -ret; + fd_unref(fd_tmp); + goto out; + } + local = frame->local; + + AFR_ONLIST(local->child_up, frame, afr_selfheal_data_open_cbk, open, &loc, + O_RDWR | O_LARGEFILE, fd_tmp, NULL); + + ret = -ENOTCONN; + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + + if (local->replies[i].op_ret < 0) { + ret = -local->replies[i].op_errno; + continue; } - return afr_sh_data_lock_rec (frame, this); + ret = 0; + break; + } + + if (ret < 0) { + fd_unref(fd_tmp); + goto out; + } else { + fd_bind(fd_tmp); + } + + *fd = fd_tmp; +out: + loc_wipe(&loc); + if (frame) + 
AFR_STACK_DESTROY(frame); + return ret; } - -int -afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - /* TODO: some of the open's might fail. - In that case, modify cleanup fn to send flush on those - fd's which are already open */ - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, - "open of %s failed on child %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - sh->op_failed = 1; - } - - gf_log (this->name, GF_LOG_TRACE, - "open of %s succeeded on child %s", - local->loc.path, - priv->children[child_index]->name); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (sh->op_failed) { - afr_sh_data_finish (frame, this); - return 0; - } - - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened, commencing sync", - local->loc.path); - - afr_sh_data_lock (frame, this); - } - - return 0; -} - - int -afr_sh_data_open (call_frame_t *frame, xlator_t *this) +afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd) { - int i = 0; - int call_count = 0; - - fd_t *fd = NULL; - - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (sh->healing_fd_opened) { - /* caller has opened the fd for us already, so skip open */ - - afr_sh_data_lock (frame, this); - return 0; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + int ret = 0; + inode_t *inode = fd->inode; + + priv = this->private; + + locked_on = alloca0(priv->child_count); + + ret = afr_selfheal_tie_breaker_inodelk(frame, this, inode, 
priv->sh_domain, + 0, 0, locked_on); + { + if (ret < priv->child_count) { + gf_msg_debug(this->name, 0, + "%s: Skipping " + "self-heal as only %d number of " + "subvolumes could be locked", + uuid_utoa(fd->inode->gfid), ret); + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. + */ + ret = -ENOTCONN; + goto unlock; } - call_count = afr_up_children_count (priv->child_count, local->child_up); - local->call_count = call_count; - - fd = fd_create (local->loc.inode, frame->root->pid); - sh->healing_fd = fd; + ret = __afr_selfheal_data(frame, this, fd, locked_on); + } +unlock: + afr_selfheal_uninodelk(frame, this, inode, priv->sh_domain, 0, 0, + locked_on); - /* open sinks */ - for (i = 0; i < priv->child_count; i++) { - if(!local->child_up[i]) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->open, - &local->loc, - O_RDWR|O_LARGEFILE, fd, 0); - - if (!--call_count) - break; - } - - return 0; + return ret; } - - -int -afr_self_heal_data (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = this->private; - - - local = frame->local; - sh = &local->self_heal; - - if (sh->need_data_self_heal && priv->data_self_heal) { - afr_sh_data_open (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "not doing data self heal on %s", - local->loc.path); - afr_sh_data_done (frame, this); - } - - return 0; -} - diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 2fac06a5283..64893f441e3 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -1,2311 +1,1276 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2013 Red Hat, Inc. 
<http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "inode.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" - -int -afr_sh_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this); +#include <glusterfs/byte-order.h> +#include "afr-transaction.h" +#include "afr-messages.h" +#include <glusterfs/syncop-utils.h> +#include <glusterfs/events.h> int -afr_sh_entry_done (call_frame_t *frame, xlator_t *this) +afr_selfheal_entry_anon_inode(xlator_t *this, inode_t *dir, const char *name, + inode_t *inode, int 
child, + struct afr_reply *replies, + gf_boolean_t *anon_inode) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - /* - TODO: cleanup sh->* - */ - - if (sh->healing_fd) - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - -/* for (i = 0; i < priv->child_count; i++) { */ -/* sh->locked_nodes[i] = 0; */ -/* } */ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t *subvol = NULL; + int ret = 0; + int i = 0; + char g[64] = {0}; + unsigned char *lookup_success = NULL; + call_frame_t *frame = NULL; + loc_t loc2 = { + 0, + }; + loc_t loc = { + 0, + }; + + priv = this->private; + subvol = priv->children[child]; + lookup_success = alloca0(priv->child_count); + uuid_utoa_r(replies[child].poststat.ia_gfid, g); + loc.inode = inode_new(inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + + if (replies[child].poststat.ia_type == IA_IFDIR) { + /* This directory may have sub-directory hierarchy which may need to + * be preserved for subsequent heals. 
So unconditionally move the + * directory to anonymous-inode directory*/ + *anon_inode = _gf_true; + goto anon_inode; + } + + frame = afr_frame_create(this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + local = frame->local; + gf_uuid_copy(loc.gfid, replies[child].poststat.ia_gfid); + AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc, + NULL); + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == 0) { + lookup_success[i] = 1; + } else if (local->replies[i].op_errno != ENOENT && + local->replies[i].op_errno != ESTALE) { + ret = -local->replies[i].op_errno; + } + } - gf_log (this->name, GF_LOG_TRACE, - "self heal of %s completed", - local->loc.path); + if (priv->quorum_count) { + if (afr_has_quorum(lookup_success, this, NULL)) { + *anon_inode = _gf_true; + } + } else if (AFR_COUNT(lookup_success, priv->child_count) > 1) { + *anon_inode = _gf_true; + } else if (ret) { + goto out; + } + +anon_inode: + if (!*anon_inode) { + ret = 0; + goto out; + } + + loc.parent = inode_ref(dir); + gf_uuid_copy(loc.pargfid, dir->gfid); + loc.name = name; + + ret = afr_anon_inode_create(this, child, &loc2.parent); + if (ret < 0) + goto out; + + loc2.name = g; + ret = syncop_rename(subvol, &loc, &loc2, NULL, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "Rename to %s dir %s/%s (%s) on %s failed", + priv->anon_inode_name, uuid_utoa(dir->gfid), name, g, + subvol->name); + } else { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "Rename to %s dir %s/%s (%s) on %s successful", + priv->anon_inode_name, uuid_utoa(dir->gfid), name, g, + subvol->name); + } - sh->completion_cbk (frame, this); +out: + loc_wipe(&loc); + loc_wipe(&loc2); + if (frame) { + AFR_STACK_DESTROY(frame); + } - return 0; + return ret; } - int -afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this) +afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name, + inode_t 
*inode, int child, struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->lock_cbk = afr_sh_entry_done; - afr_unlock (frame, this); + char g[64] = {0}; + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + int ret = 0; + loc_t loc = { + 0, + }; + gf_boolean_t anon_inode = _gf_false; + + priv = this->private; + subvol = priv->children[child]; + + if ((!replies[child].valid) || (replies[child].op_ret < 0)) { + /*Nothing to do*/ + ret = 0; + goto out; + } + + if (priv->use_anon_inode) { + ret = afr_selfheal_entry_anon_inode(this, dir, name, inode, child, + replies, &anon_inode); + if (ret < 0 || anon_inode) + goto out; + } + + loc.parent = inode_ref(dir); + loc.inode = inode_new(inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + loc.name = name; + switch (replies[child].poststat.ia_type) { + case IA_IFDIR: + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), name, + uuid_utoa_r(replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL); + break; + default: + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid), + name, uuid_utoa_r(replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_unlink(subvol, &loc, NULL, NULL); + break; + } - return 0; +out: + loc_wipe(&loc); + return ret; } - int -afr_sh_entry_finish (call_frame_t *frame, xlator_t *this) +afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, + unsigned char *sources, inode_t *dir, + const char *name, inode_t *inode, + struct afr_reply *replies) { - afr_local_t *local = NULL; - - local = frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "finishing entry selfheal of %s", local->loc.path); - - afr_sh_entry_unlock (frame, this); + int ret = 0; + 
loc_t loc = { + 0, + }; + loc_t srcloc = { + 0, + }; + loc_t anonloc = { + 0, + }; + xlator_t *this = frame->this; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + struct iatt *iatt = NULL; + char *linkname = NULL; + mode_t mode = 0; + struct iatt newent = { + 0, + }; + unsigned char *newentry = NULL; + char iatt_uuid_str[64] = {0}; + char dir_uuid_str[64] = {0}; + + priv = this->private; + iatt = &replies[source].poststat; + uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str); + if (iatt->ia_type == IA_INVAL || gf_uuid_is_null(iatt->ia_gfid)) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SELF_HEAL_FAILED, + "Invalid ia_type (%d) or gfid(%s). source brick=%d, " + "pargfid=%s, name=%s", + iatt->ia_type, iatt_uuid_str, source, + uuid_utoa_r(dir->gfid, dir_uuid_str), name); + ret = -EINVAL; + goto out; + } + + xdata = dict_new(); + if (!xdata) + return -ENOMEM; + newentry = alloca0(priv->child_count); + loc.parent = inode_ref(dir); + gf_uuid_copy(loc.pargfid, dir->gfid); + loc.name = name; + loc.inode = inode_ref(inode); + + ret = afr_selfheal_entry_delete(this, dir, name, inode, dst, replies); + if (ret) + goto out; + + ret = dict_set_gfuuid(xdata, "gfid-req", replies[source].poststat.ia_gfid, + true); + if (ret) + goto out; + + srcloc.inode = inode_ref(inode); + gf_uuid_copy(srcloc.gfid, iatt->ia_gfid); + ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0); + if (ret == -ENOENT || ret == -ESTALE) { + newentry[dst] = 1; + ret = afr_selfheal_newentry_mark(frame, this, inode, source, replies, + sources, newentry); + if (ret) + goto out; + } else if (ret == 0 && iatt->ia_type == IA_IFDIR && priv->use_anon_inode) { + // Try rename from hidden directory + ret = afr_anon_inode_create(this, dst, &anonloc.parent); + if (ret < 0) + goto out; + anonloc.inode = inode_ref(inode); + anonloc.name = iatt_uuid_str; + ret = syncop_rename(priv->children[dst], &anonloc, &loc, NULL, NULL); + if (ret == -ENOENT || ret == -ESTALE) + ret = -1; /*This sets 'mismatch' to true*/ + 
goto out; + } + + mode = st_mode_from_ia(iatt->ia_prot, iatt->ia_type); + + switch (iatt->ia_type) { + case IA_IFDIR: + ret = syncop_mkdir(priv->children[dst], &loc, mode, 0, xdata, NULL); + break; + case IA_IFLNK: + if (!newentry[dst]) { + ret = syncop_link(priv->children[dst], &srcloc, &loc, &newent, + NULL, NULL); + } else { + ret = syncop_readlink(priv->children[source], &srcloc, + &linkname, 4096, NULL, NULL); + if (ret <= 0) + goto out; + ret = syncop_symlink(priv->children[dst], &loc, linkname, NULL, + xdata, NULL); + } + break; + default: + ret = dict_set_int32_sizen(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1); + if (ret) + goto out; + ret = syncop_mknod( + priv->children[dst], &loc, mode, + makedev(ia_major(iatt->ia_rdev), ia_minor(iatt->ia_rdev)), + &newent, xdata, NULL); + break; + } - return 0; +out: + if (xdata) + dict_unref(xdata); + GF_FREE(linkname); + loc_wipe(&loc); + loc_wipe(&srcloc); + loc_wipe(&anonloc); + return ret; } - -int -afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) +static int +__afr_selfheal_heal_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, int source, + unsigned char *sources, unsigned char *healed_sinks, + unsigned char *locked_on, struct afr_reply *replies) { - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *orig_local = NULL; - call_frame_t *orig_frame = NULL; + int ret = 0; + afr_private_t *priv = NULL; + int i = 0; - call_count = afr_frame_return (frame); + priv = this->private; - if (call_count == 0) { - local = frame->local; - sh = &local->self_heal; + if (!replies[source].valid) + return -EIO; - orig_frame = sh->orig_frame; - orig_local = orig_frame->local; + /* Skip healing this entry if the last lookup on it failed for reasons + * other than ENOENT. 
+ */ + if ((replies[source].op_ret < 0) && (replies[source].op_errno != ENOENT)) + return -replies[source].op_errno; - if (sh->source != -1) { - orig_local->cont.lookup.buf.ia_nlink = sh->buf[sh->source].ia_nlink; - } + if (replies[source].op_ret == 0) { + ret = afr_lookup_and_heal_gfid(this, fd->inode, name, inode, replies, + source, sources, + &replies[source].poststat.ia_gfid, NULL); + if (ret) + return ret; + } + + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + if (replies[source].op_ret == -1 && + replies[source].op_errno == ENOENT) { + ret = afr_selfheal_entry_delete(this, fd->inode, name, inode, i, + replies); + } else { + if (!gf_uuid_compare(replies[i].poststat.ia_gfid, + replies[source].poststat.ia_gfid)) + continue; - afr_sh_entry_finish (frame, this); + ret = afr_selfheal_recreate_entry(frame, i, source, sources, + fd->inode, name, inode, replies); } + if (ret < 0) + break; + } - return 0; + return ret; } - -int -afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - int need_unwind = 0; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success, - priv->child_count, AFR_ENTRY_TRANSACTION); - - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, - gf_afr_mt_dict_t); - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } - } - - if (call_count == 0) - need_unwind = 1; - - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_ENTRY_TRANSACTION); - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "erasing pending flags 
from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } - - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - GF_FREE (erase_xattr); - - if (need_unwind) - afr_sh_entry_finish (frame, this); - - return 0; -} - - - static int -next_active_source (call_frame_t *frame, xlator_t *this, - int current_active_source) +afr_selfheal_detect_gfid_and_type_mismatch(xlator_t *this, + struct afr_reply *replies, + inode_t *inode, uuid_t pargfid, + char *bname, int src_idx, + unsigned char *locked_on, int *src) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int source = -1; - int next_active_source = -1; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - source = sh->source; - - if (source != -1) { - if (current_active_source != source) - next_active_source = source; - goto out; - } - - /* - the next active sink becomes the source for the - 'conservative decision' of merging all entries - */ - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) - && (local->child_up[i] == 1) - && (i > current_active_source)) { - - next_active_source = i; - break; - } - } -out: - return next_active_source; -} + int i = 0; + int ret = -1; + afr_private_t *priv = NULL; + void *gfid = NULL; + ia_type_t ia_type = IA_INVAL; + priv = this->private; + gfid = &replies[src_idx].poststat.ia_gfid; + ia_type = replies[src_idx].poststat.ia_type; + for (i = 0; i < priv->child_count; i++) { + if (i == src_idx) + continue; -static int -next_active_sink (call_frame_t *frame, xlator_t *this, - int current_active_sink) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int 
next_active_sink = -1; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - /* - the next active sink becomes the source for the - 'conservative decision' of merging all entries - */ - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) - && (local->child_up[i] == 1) - && (i > current_active_sink)) { - - next_active_sink = i; - break; - } - } - - return next_active_sink; -} + if (!replies[i].valid) + continue; + if (replies[i].op_ret != 0) + continue; -int -build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) -{ - int ret = -1; - - if (!child) { - goto out; - } + if (gf_uuid_is_null(replies[i].poststat.ia_gfid)) + continue; - if (strcmp (parent->path, "/") == 0) - ret = gf_asprintf ((char **)&child->path, "/%s", name); - else - ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, - name); + if (replies[i].poststat.ia_type == IA_INVAL) + continue; - if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed while setting child path"); + if (ia_type == IA_INVAL || gf_uuid_is_null(gfid)) { + src_idx = i; + ia_type = replies[src_idx].poststat.ia_type; + gfid = &replies[src_idx].poststat.ia_gfid; + continue; } - if (!child->path) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - child->name = strrchr (child->path, '/'); - if (child->name) - child->name++; - - child->parent = inode_ref (parent->inode); - child->inode = inode_new (parent->inode->table); - - if (!child->inode) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ret = 0; -out: - if (ret == -1) - loc_wipe (child); - - return ret; -} - - -int -afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); - -int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src); - -int -afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this); - -int -afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, - int 
active_src); - -int -afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this, - int active_src) -{ - int call_count = 0; - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_entry_expunge_subvol (frame, this, active_src); - - return 0; -} - -int -afr_sh_entry_expunge_parent_setattr_cbk (call_frame_t *expunge_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - call_frame_t *frame = NULL; - - int active_src = (long) cookie; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - - if (op_ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "setattr on parent directory of %s on subvolume %s failed: %s", - expunge_local->loc.path, - priv->children[active_src]->name, strerror (op_errno)); + if (gf_uuid_compare(gfid, replies[i].poststat.ia_gfid) && + (ia_type == replies[i].poststat.ia_type)) { + ret = afr_gfid_split_brain_source(this, replies, inode, pargfid, + bname, src_idx, i, locked_on, src, + NULL); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "Skipping conservative merge on the " + "file."); + return ret; } - AFR_STACK_DESTROY (expunge_frame); - afr_sh_entry_expunge_entry_done (frame, this, active_src); + if (ia_type != replies[i].poststat.ia_type) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "Type mismatch detected " + "for <gfid:%s>/%s>, %s on %s and %s on %s. 
" + "Skipping conservative merge on the file.", + uuid_utoa(pargfid), bname, + gf_inode_type_to_str(replies[i].poststat.ia_type), + priv->children[i]->name, + gf_inode_type_to_str(replies[src_idx].poststat.ia_type), + priv->children[src_idx]->name); + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;type=file;" + "file=<gfid:%s>/%s>;count=2;child-%d=%s;type-" + "%d=%s;child-%d=%s;type-%d=%s", + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(pargfid), bname, i, priv->children[i]->name, i, + gf_inode_type_to_str(replies[i].poststat.ia_type), src_idx, + priv->children[src_idx]->name, src_idx, + gf_inode_type_to_str(replies[src_idx].poststat.ia_type)); + return -1; + } + } - return 0; + return 0; } - -int -afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) +static int +__afr_selfheal_merge_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, unsigned char *sources, + unsigned char *healed_sinks, + unsigned char *locked_on, struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int active_src = 0; - call_frame_t *frame = NULL; - - int32_t valid = 0; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - - active_src = (long) cookie; - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "removed %s on %s", - expunge_local->loc.path, - priv->children[active_src]->name); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "removing %s on %s failed (%s)", - expunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - } - - valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - afr_build_parent_loc (&expunge_sh->parent_loc, &expunge_local->loc); - - STACK_WIND_COOKIE (expunge_frame, 
afr_sh_entry_expunge_parent_setattr_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->setattr, - &expunge_sh->parent_loc, - &expunge_sh->parentbuf, - valid); + int ret = 0; + int i = 0; + int source = -1; + int src = -1; + afr_private_t *priv = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid && replies[i].op_ret == 0) { + source = i; + break; + } + } + if (source == -1) { + /* entry got deleted in the mean time? */ return 0; -} - - -int -afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "expunging file %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->unlink, - &expunge_local->loc); - - return 0; -} - - - -int -afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_DEBUG, - "expunging directory %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); + } - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->rmdir, - &expunge_local->loc, 1); - - return 0; -} - - -int -afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, - int active_src, struct iatt *buf) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int source = 0; - call_frame_t *frame = NULL; - int type 
= 0; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - source = expunge_sh->source; - - type = buf->ia_type; - - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - case IA_IFLNK: - afr_sh_entry_expunge_unlink (expunge_frame, this, active_src); - break; - case IA_IFDIR: - afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - expunge_local->loc.path, - priv->children[source]->name, type); - goto out; - break; - } - - return 0; -out: - AFR_STACK_DESTROY (expunge_frame); - afr_sh_entry_expunge_entry_done (frame, this, active_src); - - return 0; -} - - -int -afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *x, - struct iatt *postparent) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - active_src = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, - "lookup of %s on %s failed (%s)", - expunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } - - afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf); - - return 0; -out: - AFR_STACK_DESTROY (expunge_frame); - afr_sh_entry_expunge_entry_done (frame, this, active_src); - - return 0; -} - - -int -afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log 
(this->name, GF_LOG_TRACE, - "looking up %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->lookup, - &expunge_local->loc, 0); - - return 0; -} - - -int -afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *x, - struct iatt *postparent) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int source = 0; - call_frame_t *frame = NULL; - int active_src = 0; - int need_expunge = 0; - - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - active_src = expunge_sh->active_source; - source = (long) cookie; - - if (op_ret == -1 && op_errno == ENOENT) - need_expunge = 1; - - if (!uuid_is_null (expunge_sh->entrybuf.ia_gfid) && - !uuid_is_null (buf->ia_gfid) && - (uuid_compare (expunge_sh->entrybuf.ia_gfid, buf->ia_gfid) != 0)) { - char uuidbuf1[64]; - char uuidbuf2[64]; - gf_log (this->name, GF_LOG_DEBUG, - "entry %s found on %s with mismatching gfid (%s/%s)", - expunge_local->loc.path, - priv->children[source]->name, - uuid_utoa_r (expunge_sh->entrybuf.ia_gfid, uuidbuf1), - uuid_utoa_r (buf->ia_gfid, uuidbuf2)); - need_expunge = 1; + /* Set all the sources as 1, otheriwse newentry_mark won't be set */ + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid && replies[i].op_ret == 0) { + sources[i] = 1; + } + } + + ret = afr_lookup_and_heal_gfid(this, fd->inode, name, inode, replies, + source, sources, + &replies[source].poststat.ia_gfid, NULL); + if (ret) + return ret; + + /* In case of type mismatch / unable to resolve gfid mismatch on the + * entry, return -1.*/ + ret = afr_selfheal_detect_gfid_and_type_mismatch( + 
this, replies, inode, fd->inode->gfid, name, source, locked_on, &src); + + if (ret < 0) + return ret; + if (src != -1) { + source = src; + for (i = 0; i < priv->child_count; i++) { + if (i != src && replies[i].valid && + gf_uuid_compare(replies[src].poststat.ia_gfid, + replies[i].poststat.ia_gfid)) { + sources[i] = 0; + } + } + } + + for (i = 0; i < priv->child_count; i++) { + if (i == source || !healed_sinks[i]) + continue; + + if (src != -1) { + if (!gf_uuid_compare(replies[src].poststat.ia_gfid, + replies[i].poststat.ia_gfid)) + continue; + } else if (replies[i].op_errno != ENOENT) { + continue; } - if (need_expunge) { - gf_log (this->name, GF_LOG_TRACE, - "missing entry %s on %s", - expunge_local->loc.path, - priv->children[source]->name); - - if (postparent) - expunge_sh->parentbuf = *postparent; - - afr_sh_entry_expunge_purge (expunge_frame, this, active_src); - - return 0; - } - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "%s exists under %s", - expunge_local->loc.path, - priv->children[source]->name); - } else { - gf_log (this->name, GF_LOG_TRACE, - "looking up %s under %s failed (%s)", - expunge_local->loc.path, - priv->children[source]->name, - strerror (op_errno)); - } - - AFR_STACK_DESTROY (expunge_frame); - afr_sh_entry_expunge_entry_done (frame, this, active_src); - - return 0; -} - + ret |= afr_selfheal_recreate_entry(frame, i, source, sources, fd->inode, + name, inode, replies); + } -int -afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, - gf_dirent_t *entry) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int ret = -1; - call_frame_t *expunge_frame = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int active_src = 0; - int source = 0; - int op_errno = 0; - char *name = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - source = sh->source; - - name = entry->d_name; - - if 
((strcmp (name, ".") == 0) - || (strcmp (name, "..") == 0) - || ((strcmp (local->loc.path, "/") == 0) - && (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0))) { - - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - name, local->loc.path); - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "inspecting existance of %s under %s", - name, local->loc.path); - - expunge_frame = copy_frame (frame); - if (!expunge_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (expunge_local, afr_local_t, out); - - expunge_frame->local = expunge_local; - expunge_sh = &expunge_local->self_heal; - expunge_sh->sh_frame = frame; - expunge_sh->active_source = active_src; - expunge_sh->entrybuf = entry->d_stat; - - - ret = build_child_loc (this, &expunge_local->loc, &local->loc, name); - if (ret != 0) { - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", expunge_local->loc.path, - priv->children[source]->name); - - STACK_WIND_COOKIE (expunge_frame, - afr_sh_entry_expunge_entry_cbk, - (void *) (long) source, - priv->children[source], - priv->children[source]->fops->lookup, - &expunge_local->loc, 0); - - ret = 0; -out: - if (ret == -1) - afr_sh_entry_expunge_entry_done (frame, this, active_src); - - return 0; + return ret; } - -int -afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) +static int +__afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, int source, + unsigned char *sources, unsigned char *healed_sinks, + unsigned char *locked_on, struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - gf_dirent_t *entry = NULL; - off_t last_offset = 0; - int active_src = 0; - int entry_count = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = 
sh->active_source; - - if (op_ret <= 0) { - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "readdir of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "readdir of %s on subvolume %s complete", - local->loc.path, - priv->children[active_src]->name); - } - - afr_sh_entry_expunge_all (frame, this); - return 0; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - entry_count++; - } - - gf_log (this->name, GF_LOG_TRACE, - "readdir'ed %d entries from %s", - entry_count, priv->children[active_src]->name); - - sh->offset = last_offset; - local->call_count = entry_count; - - list_for_each_entry (entry, &entries->list, list) { - afr_sh_entry_expunge_entry (frame, this, entry); - } - - return 0; + int ret = -1; + + if (source < 0) + ret = __afr_selfheal_merge_dirent(frame, this, fd, name, inode, sources, + healed_sinks, locked_on, replies); + else + ret = __afr_selfheal_heal_dirent(frame, this, fd, name, inode, source, + sources, healed_sinks, locked_on, + replies); + return ret; } -int -afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src) +static gf_boolean_t +is_full_heal_marker_present(xlator_t *this, dict_t *xdata, int idx) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, - priv->children[active_src], - priv->children[active_src]->fops->readdirp, - sh->healing_fd, sh->block_size, sh->offset); - - return 0; + int i = 0; + int pending[3] = { + 0, + }; + void *pending_raw = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + + if (!xdata) + return _gf_false; + + /* Iterate over each of the priv->pending_keys[] elements and then + * see if any of them have data segment non-zero. 
If they do, return + * true. Else return false. + */ + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw)) + continue; + + if (!pending_raw) + continue; + + memcpy(pending, pending_raw, sizeof(pending)); + if (ntoh32(pending[idx])) + return _gf_true; + } + + return _gf_false; } - -int -afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) +static gf_boolean_t +afr_need_full_heal(xlator_t *this, struct afr_reply *replies, int source, + unsigned char *healed_sinks, afr_transaction_type type) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int active_src = -1; + int i = 0; + int idx = 0; + afr_private_t *priv = NULL; - priv = this->private; - local = frame->local; - sh = &local->self_heal; + priv = this->private; - sh->offset = 0; + if (!priv->esh_granular) + return _gf_true; - if (sh->source == -1) { - gf_log (this->name, GF_LOG_TRACE, - "no active sources for %s to expunge entries", - local->loc.path); - goto out; - } + if (type != AFR_ENTRY_TRANSACTION) + return _gf_true; - active_src = next_active_sink (frame, this, sh->active_source); - sh->active_source = active_src; + priv = this->private; + idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION); - if (sh->op_failed) { - goto out; - } + /* If there is a clear source, check whether the full-heal-indicator + * is present in its xdata. Otherwise, we need to examine all the + * participating bricks and then figure if *even* one of them has a + * full-heal-indicator. 
+ */ - if (active_src == -1) { - /* completed creating missing files on all subvolumes */ - goto out; - } + if (source != -1) { + if (is_full_heal_marker_present(this, replies[source].xdata, idx)) + return _gf_true; + } - gf_log (this->name, GF_LOG_TRACE, - "expunging entries of %s on %s to other sinks", - local->loc.path, priv->children[active_src]->name); + /* else ..*/ - afr_sh_entry_expunge_subvol (frame, this, active_src); + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; - return 0; -out: - afr_sh_entry_impunge_all (frame, this); - return 0; + if (is_full_heal_marker_present(this, replies[i].xdata, idx)) + return _gf_true; + } + return _gf_false; } - -int -afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this, - int active_src) +static int +__afr_selfheal_entry_finalize_source(xlator_t *this, unsigned char *sources, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct afr_reply *replies, + uint64_t *witness) { - int call_count = 0; - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_entry_impunge_subvol (frame, this, active_src); + afr_private_t *priv = NULL; + int source = -1; + int sources_count = 0; + int i = 0; + + priv = this->private; + + sources_count = AFR_COUNT(sources, priv->child_count); + + if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) || + !sources_count || afr_does_witness_exist(this, witness)) { + memset(sources, 0, sizeof(*sources) * priv->child_count); + afr_mark_active_sinks(this, sources, locked_on, healed_sinks); + return -1; + } + + source = afr_choose_source_by_policy(priv, sources, AFR_ENTRY_TRANSACTION); + + /*If the selected source does not blame any other brick, then mark + * everything as sink to trigger conservative merge. 
+ */ + if (source != -1 && !AFR_COUNT(healed_sinks, priv->child_count)) { + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) { + sources[i] = 0; + healed_sinks[i] = 1; + } + } + return -1; + } - return 0; + return source; } - int -afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) +__afr_selfheal_entry_prepare(call_frame_t *frame, xlator_t *this, + inode_t *inode, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, + struct afr_reply *replies, int *source_p, + unsigned char *pflag) { - int call_count = 0; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; - int child_index = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - active_src = sh->active_source; - child_index = (long) cookie; - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "setattr done for %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "setattr (%s) on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - } - - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); - } - - return 0; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + uint64_t *witness = NULL; + + priv = this->private; + + ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); + if (ret) + 
return ret; + + witness = alloca0(sizeof(*witness) * priv->child_count); + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_ENTRY_TRANSACTION, locked_on, sources, + sinks, witness, pflag); + if (ret) + return ret; + + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count); + + source = __afr_selfheal_entry_finalize_source(this, sources, healed_sinks, + locked_on, replies, witness); + + if (source < 0) { + /* If source is < 0 (typically split-brain), we perform a + conservative merge of entries rather than erroring out */ + } + *source_p = source; + + return ret; } - -int -afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - dict_t *xattr) +static int +afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *parent_idx_inode, + xlator_t *subvol, gf_boolean_t full_crawl) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = 0; - - struct iatt stbuf; - int32_t valid = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - child_index = (long) cookie; - - gf_log (this->name, GF_LOG_TRACE, - "setting ownership of %s on %s to %d/%d", - impunge_local->loc.path, - priv->children[child_index]->name, - impunge_local->cont.lookup.buf.ia_uid, - impunge_local->cont.lookup.buf.ia_gid); - - stbuf.ia_atime = impunge_local->cont.lookup.buf.ia_atime; - stbuf.ia_atime_nsec = impunge_local->cont.lookup.buf.ia_atime_nsec; - stbuf.ia_mtime = impunge_local->cont.lookup.buf.ia_mtime; - stbuf.ia_mtime_nsec = 
impunge_local->cont.lookup.buf.ia_mtime_nsec; - - stbuf.ia_uid = impunge_local->cont.lookup.buf.ia_uid; - stbuf.ia_gid = impunge_local->cont.lookup.buf.ia_gid; - - valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_setattr_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->setattr, - &impunge_local->loc, - &stbuf, valid); + int ret = 0; + int source = -1; + unsigned char *locked_on = NULL; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *healed_sinks = NULL; + inode_t *inode = NULL; + struct afr_reply *replies = NULL; + struct afr_reply *par_replies = NULL; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + + priv = this->private; + + if (afr_is_private_directory(priv, fd->inode->gfid, name, + GF_CLIENT_PID_SELF_HEALD)) { return 0; -} - - -int -afr_sh_entry_impunge_parent_setattr_cbk (call_frame_t *setattr_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) -{ - loc_t *parent_loc = cookie; - - if (op_ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "setattr on parent directory failed: %s", - strerror (op_errno)); + } + + xattr = dict_new(); + if (!xattr) + return -ENOMEM; + ret = dict_set_int32_sizen(xattr, GF_GFIDLESS_LOOKUP, 1); + if (ret) { + dict_unref(xattr); + return -1; + } + + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + healed_sinks = alloca0(priv->child_count); + locked_on = alloca0(priv->child_count); + + replies = alloca0(priv->child_count * sizeof(*replies)); + par_replies = alloca0(priv->child_count * sizeof(*par_replies)); + + ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL, + locked_on); + { + if (ret < priv->child_count) { + gf_msg_debug(this->name, 0, + "%s: Skipping " + "entry self-heal as only %d sub-volumes " + " could be locked in %s domain", + 
uuid_utoa(fd->inode->gfid), ret, this->name); + ret = -ENOTCONN; + goto unlock; } - loc_wipe (parent_loc); - - GF_FREE (parent_loc); - - AFR_STACK_DESTROY (setattr_frame); - return 0; -} - - -int -afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, - struct iatt *postparent) -{ - int call_count = 0; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; - int child_index = 0; - int pending_array[3] = {0, }; - dict_t *xattr = NULL; - int ret = 0; - int idx = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - call_frame_t *setattr_frame = NULL; - int32_t valid = 0; - loc_t *parent_loc = NULL; - struct iatt parentbuf; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - active_src = sh->active_source; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "creation of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - goto out; - } - - inode->ia_type = stbuf->ia_type; - - xattr = get_new_dict (); - dict_ref (xattr); - - idx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); - pending_array[idx] = hton32 (1); - if (IA_ISDIR (stbuf->ia_type)) - idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); - else - idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - pending_array[idx] = hton32 (1); - - ret = dict_set_static_bin (xattr, priv->pending_key[child_index], - pending_array, sizeof (pending_array)); + ret = __afr_selfheal_entry_prepare(frame, this, fd->inode, locked_on, + sources, sinks, healed_sinks, + par_replies, &source, NULL); if (ret < 0) - 
gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - - valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - parentbuf = impunge_sh->parentbuf; - setattr_frame = copy_frame (impunge_frame); - - parent_loc = GF_CALLOC (1, sizeof (*parent_loc), - gf_afr_mt_loc_t); - afr_build_parent_loc (parent_loc, &impunge_local->loc); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->xattrop, - &impunge_local->loc, GF_XATTROP_ADD_ARRAY, xattr); - - STACK_WIND_COOKIE (setattr_frame, afr_sh_entry_impunge_parent_setattr_cbk, - (void *) (long) parent_loc, - priv->children[child_index], - priv->children[child_index]->fops->setattr, - parent_loc, &parentbuf, valid); - - dict_unref (xattr); - - return 0; - -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); - } - - return 0; -} - - -int -afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - dict_t *dict = NULL; - - int ret = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing file %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - dict = dict_new (); - if (!dict) - gf_log (this->name, GF_LOG_ERROR, "Out of memory"); - - ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, "gfid set failed"); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->mknod, - &impunge_local->loc, - st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), - 
stbuf->ia_rdev, dict); - - if (dict) - dict_unref (dict); - - return 0; -} - - - -int -afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - dict_t *dict = NULL; - - int ret = 0; + goto unlock; - priv = this->private; - impunge_local = impunge_frame->local; - - dict = dict_new (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - return 0; + inode = afr_selfheal_unlocked_lookup_on(frame, fd->inode, name, replies, + locked_on, xattr); + if (!inode) { + ret = -ENOMEM; + goto unlock; } - ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, "gfid set failed"); - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing directory %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->mkdir, - &impunge_local->loc, - st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), - dict); - - if (dict) - dict_unref (dict); - - return 0; -} - - -int -afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index, const char *linkname) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - dict_t *dict = NULL; - struct iatt *buf = NULL; - - int ret = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - - buf = &impunge_local->cont.symlink.buf; - - dict = dict_new (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - afr_sh_entry_impunge_entry_done (impunge_frame, this, 0); + ret = __afr_selfheal_entry_dirent(frame, this, fd, name, inode, source, + sources, healed_sinks, locked_on, + replies); + + if ((ret == 0) && (priv->esh_granular) && parent_idx_inode) { + ret = afr_shd_entry_purge(subvol, parent_idx_inode, name, + 
inode->ia_type); + /* Why is ret force-set to 0? We do not care about + * index purge failing for full heal as it is quite + * possible during replace-brick that not all files + * and directories have their name indices present in + * entry-changes/. + */ + ret = 0; } - - ret = afr_set_dict_gfid (dict, buf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "dict set gfid failed"); - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing symlink %s -> %s on %s", - impunge_local->loc.path, linkname, - priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->symlink, - linkname, &impunge_local->loc, dict); - - if (dict) - dict_unref (dict); - - return 0; + } + +unlock: + afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, locked_on, + NULL); + if (inode) + inode_unref(inode); + if (replies) + afr_replies_wipe(replies, priv->child_count); + if (par_replies) + afr_replies_wipe(par_replies, priv->child_count); + if (xattr) + dict_unref(xattr); + + return ret; } - -int -afr_sh_entry_impunge_symlink_unlink_cbk (call_frame_t *impunge_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) +static inode_t * +afr_shd_entry_changes_index_inode(xlator_t *this, xlator_t *subvol, + uuid_t pargfid) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - call_frame_t *frame = NULL; - int call_count = -1; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "unlink of %s on %s failed (%s)", - impunge_local->loc.path, - 
priv->children[child_index]->name, - strerror (op_errno)); - goto out; - } - - afr_sh_entry_impunge_symlink (impunge_frame, this, child_index, - impunge_sh->linkname); + int ret = -1; + void *index_gfid = NULL; + loc_t rootloc = { + 0, + }; + loc_t loc = { + 0, + }; + dict_t *xattr = NULL; + inode_t *inode = NULL; + struct iatt iatt = { + 0, + }; + + rootloc.inode = inode_ref(this->itable->root); + gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid); + + ret = syncop_getxattr(subvol, &rootloc, &xattr, + GF_XATTROP_ENTRY_CHANGES_GFID, NULL, NULL); + if (ret || !xattr) { + errno = -ret; + goto out; + } + + ret = dict_get_ptr(xattr, GF_XATTROP_ENTRY_CHANGES_GFID, &index_gfid); + if (ret) { + errno = EINVAL; + goto out; + } + + loc.inode = inode_new(this->itable); + if (!loc.inode) { + errno = ENOMEM; + goto out; + } + + gf_uuid_copy(loc.pargfid, index_gfid); + loc.name = gf_strdup(uuid_utoa(pargfid)); + + ret = syncop_lookup(subvol, &loc, &iatt, NULL, NULL, NULL); + if (ret < 0) { + errno = -ret; + goto out; + } + + inode = inode_link(loc.inode, NULL, NULL, &iatt); - return 0; out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); - } - - return 0; -} + if (xattr) + dict_unref(xattr); + loc_wipe(&rootloc); + GF_FREE((char *)loc.name); + loc_wipe(&loc); - -int -afr_sh_entry_impunge_symlink_unlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - - priv = this->private; - impunge_local = impunge_frame->local; - - gf_log (this->name, GF_LOG_DEBUG, - "unlinking symlink %s with wrong target on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_symlink_unlink_cbk, - (void *) (long) child_index, - priv->children[child_index], - 
priv->children[child_index]->fops->unlink, - &impunge_local->loc); - - return 0; + return inode; } - -int -afr_sh_entry_impunge_readlink_sink_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *linkname, struct iatt *sbuf) +static int +afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd, + int child) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - call_frame_t *frame = NULL; - int call_count = -1; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if ((op_ret == -1) && (op_errno != ENOENT)) { - gf_log (this->name, GF_LOG_DEBUG, - "readlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } - - /* symlink doesn't exist on the sink */ - - if ((op_ret == -1) && (op_errno == ENOENT)) { - afr_sh_entry_impunge_symlink (impunge_frame, this, - child_index, impunge_sh->linkname); - return 0; - } - - - /* symlink exists on the sink, so check if targets match */ - - if (strcmp (linkname, impunge_sh->linkname) == 0) { - /* targets match, nothing to do */ - - goto out; - } else { - /* - * Hah! Sneaky wolf in sheep's clothing! 
- */ - afr_sh_entry_impunge_symlink_unlink (impunge_frame, this, - child_index); - return 0; + int ret = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + off_t offset = 0; + call_frame_t *iter_frame = NULL; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + gf_boolean_t mismatch = _gf_false; + afr_local_t *local = NULL; + loc_t loc = { + 0, + }; + + priv = this->private; + subvol = priv->children[child]; + + INIT_LIST_HEAD(&entries.list); + + local = frame->local; + + iter_frame = afr_copy_frame(frame); + if (!iter_frame) + return -ENOMEM; + + loc.inode = afr_shd_entry_changes_index_inode(this, subvol, + fd->inode->gfid); + + while ((ret = syncop_readdir(subvol, fd, 131072, offset, &entries, NULL, + NULL))) { + if (ret > 0) + ret = 0; + list_for_each_entry(entry, &entries.list, list) + { + offset = entry->d_off; + + if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) + continue; + + ret = afr_selfheal_entry_dirent(iter_frame, this, fd, entry->d_name, + loc.inode, subvol, + local->need_full_crawl); + AFR_STACK_RESET(iter_frame); + if (iter_frame->local == NULL) { + ret = -ENOTCONN; + break; + } + + if (ret == -1) { + /* gfid or type mismatch. 
*/ + mismatch = _gf_true; + ret = 0; + } + if (ret) + break; } -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); - } - - return 0; -} - - -int -afr_sh_entry_impunge_readlink_sink (call_frame_t *impunge_frame, xlator_t *this, - int child_index) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - - priv = this->private; - impunge_local = impunge_frame->local; - - gf_log (this->name, GF_LOG_DEBUG, - "checking symlink target of %s on %s", - impunge_local->loc.path, priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_sink_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->readlink, - &impunge_local->loc, 4096); - - return 0; -} - - -int -afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *linkname, struct iatt *sbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - call_frame_t *frame = NULL; - int call_count = -1; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "readlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } - - impunge_sh->linkname = gf_strdup (linkname); - afr_sh_entry_impunge_readlink_sink (impunge_frame, this, child_index); - - return 0; - -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK 
(&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); - } - - return 0; -} + gf_dirent_free(&entries); + if (ret) + break; + } + loc_wipe(&loc); -int -afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - active_src = impunge_sh->active_source; - impunge_local->cont.symlink.buf = *stbuf; - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->readlink, - &impunge_local->loc, 4096); - - return 0; + AFR_STACK_DESTROY(iter_frame); + if (mismatch == _gf_true) + /* undo pending will be skipped */ + ret = -1; + return ret; } - -int -afr_sh_entry_impunge_recreate_lookup_cbk (call_frame_t *impunge_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - dict_t *xattr,struct iatt *postparent) +static int +afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry, + loc_t *parent, void *data) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = 0; - int type = 0; - int child_index = 0; - call_frame_t *frame = NULL; - int call_count = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - - child_index = (long) cookie; - - active_src = impunge_sh->active_source; - - if (op_ret != 0) { - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s (for %s) failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - 
priv->children[child_index]->name, - strerror (op_errno)); - goto out; - } - - impunge_sh->parentbuf = *postparent; - - impunge_local->cont.lookup.buf = *buf; - type = buf->ia_type; - - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - afr_sh_entry_impunge_mknod (impunge_frame, this, - child_index, buf); - break; - case IA_IFLNK: - afr_sh_entry_impunge_readlink (impunge_frame, this, - child_index, buf); - break; - case IA_IFDIR: - afr_sh_entry_impunge_mkdir (impunge_frame, this, - child_index, buf); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - impunge_local->loc.path, - priv->children[active_src]->name, type); - goto out; - break; - } - - return 0; + int ret = 0; + loc_t loc = { + 0, + }; + struct iatt iatt = { + 0, + }; + afr_granular_esh_args_t *args = data; + + /* Look up the actual inode associated with entry. If the lookup returns + * ESTALE or ENOENT, then it means we have a stale index. Remove it. + * This is analogous to the check in afr_shd_index_heal() except that + * here it is achieved through LOOKUP and in afr_shd_index_heal() through + * a GETXATTR. + */ + + loc.inode = inode_new(args->xl->itable); + loc.parent = inode_ref(args->heal_fd->inode); + gf_uuid_copy(loc.pargfid, loc.parent->gfid); + loc.name = entry->d_name; + + ret = syncop_lookup(args->xl, &loc, &iatt, NULL, NULL, NULL); + if ((ret == -ENOENT) || (ret == -ESTALE)) { + /* The name indices under the pgfid index dir are guaranteed + * to be regular files. Hence the hardcoding. + */ + afr_shd_entry_purge(subvol, parent->inode, entry->d_name, IA_IFREG); + ret = 0; + goto out; + } + /* TBD: afr_shd_zero_xattrop? 
*/ + + ret = afr_selfheal_entry_dirent(args->frame, args->xl, args->heal_fd, + entry->d_name, parent->inode, subvol, + _gf_false); + AFR_STACK_RESET(args->frame); + if (args->frame->local == NULL) + ret = -ENOTCONN; + + if (ret == -1) + args->mismatch = _gf_true; out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); - } - - return 0; + loc_wipe(&loc); + return 0; } - -int -afr_sh_entry_impunge_recreate (call_frame_t *impunge_frame, xlator_t *this, - int child_index) +static int +afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd, + int subvol_idx, gf_boolean_t is_src) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = 0; - - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - active_src = impunge_sh->active_source; - - STACK_WIND_COOKIE (impunge_frame, - afr_sh_entry_impunge_recreate_lookup_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->lookup, - &impunge_local->loc, 0); - - return 0; -} - + int ret = 0; + loc_t loc = { + 0, + }; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + afr_granular_esh_args_t args = { + 0, + }; + + priv = this->private; + subvol = priv->children[subvol_idx]; + + args.frame = afr_copy_frame(frame); + if (!args.frame) + goto out; + args.xl = this; + /* args.heal_fd represents the fd associated with the original directory + * on which entry heal is being attempted. 
+ */ + args.heal_fd = fd; + + /* @subvol here represents the subvolume of AFR where + * indices/entry-changes/<pargfid> will be processed + */ + loc.inode = afr_shd_entry_changes_index_inode(this, subvol, + fd->inode->gfid); + if (!loc.inode) { + /* If granular heal failed on the sink (as it might sometimes + * because it is the src that would mostly contain the granular + * changelogs and the sink's entry-changes would be empty), + * do not treat heal as failure. + */ + if (is_src) + ret = -errno; + else + ret = 0; + goto out; + } -int -afr_sh_entry_impunge_entry_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *x, - struct iatt *postparent) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int call_count = 0; - int child_index = 0; - call_frame_t *frame = NULL; - int active_src = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - child_index = (long) cookie; - active_src = impunge_sh->active_source; - - if ((op_ret == -1 && op_errno == ENOENT) - || (IA_ISLNK (impunge_sh->impunging_entry_mode))) { - - /* - * A symlink's target might have changed, so - * always go down the recreate path for them. 
- */ - - /* decrease call_count in recreate-callback */ - - gf_log (this->name, GF_LOG_TRACE, - "missing entry %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - afr_sh_entry_impunge_recreate (impunge_frame, this, - child_index); - return 0; - } - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "%s exists under %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - impunge_sh->parentbuf = *postparent; - } else { - gf_log (this->name, GF_LOG_TRACE, - "looking up %s under %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - } - - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); - } - - return 0; -} + ret = syncop_dir_scan(subvol, &loc, GF_CLIENT_PID_SELF_HEALD, &args, + afr_selfheal_entry_granular_dirent); + loc_wipe(&loc); -int -afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, - gf_dirent_t *entry) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int ret = -1; - call_frame_t *impunge_frame = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = 0; - int i = 0; - int call_count = 0; - int op_errno = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - - if ((strcmp (entry->d_name, ".") == 0) - || (strcmp (entry->d_name, "..") == 0) - || ((strcmp (local->loc.path, "/") == 0) - && (strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR) == 0))) { - - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - entry->d_name, local->loc.path); - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "inspecting existance of %s under %s", - entry->d_name, local->loc.path); - - impunge_frame = 
copy_frame (frame); - if (!impunge_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (impunge_local, afr_local_t, out); - - impunge_frame->local = impunge_local; - impunge_sh = &impunge_local->self_heal; - impunge_sh->sh_frame = frame; - impunge_sh->active_source = active_src; - - impunge_sh->impunging_entry_mode = - st_mode_from_ia (entry->d_stat.ia_prot, entry->d_stat.ia_type); - - ret = build_child_loc (this, &impunge_local->loc, &local->loc, entry->d_name); - if (ret != 0) { - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - if (i == active_src) - continue; - if (local->child_up[i] == 0) - continue; - if (sh->sources[i] == 1) - continue; - call_count++; - } - - impunge_local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (i == active_src) - continue; - if (local->child_up[i] == 0) - continue; - if (sh->sources[i] == 1) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", impunge_local->loc.path, - priv->children[i]->name); - - STACK_WIND_COOKIE (impunge_frame, - afr_sh_entry_impunge_entry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &impunge_local->loc, 0); - - if (!--call_count) - break; - } - - ret = 0; + if (args.mismatch == _gf_true) + ret = -1; out: - if (ret == -1) - afr_sh_entry_impunge_entry_done (frame, this, active_src); - - return 0; -} - - -int -afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - gf_dirent_t *entry = NULL; - off_t last_offset = 0; - int active_src = 0; - int entry_count = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - - if (op_ret <= 0) { - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "readdir of %s on subvolume %s 
failed (%s)", - local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "readdir of %s on subvolume %s complete", - local->loc.path, - priv->children[active_src]->name); - } - - afr_sh_entry_impunge_all (frame, this); - return 0; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - entry_count++; - } - - gf_log (this->name, GF_LOG_TRACE, - "readdir'ed %d entries from %s", - entry_count, priv->children[active_src]->name); - - sh->offset = last_offset; - local->call_count = entry_count; - - list_for_each_entry (entry, &entries->list, list) { - afr_sh_entry_impunge_entry (frame, this, entry); - } - - return 0; -} - - -int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk, - priv->children[active_src], - priv->children[active_src]->fops->readdirp, - sh->healing_fd, sh->block_size, sh->offset); - - return 0; -} - - -int -afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int active_src = -1; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh->offset = 0; - - active_src = next_active_source (frame, this, sh->active_source); - sh->active_source = active_src; - - if (sh->op_failed) { - afr_sh_entry_finish (frame, this); - return 0; - } - - if (active_src == -1) { - /* completed creating missing files on all subvolumes */ - afr_sh_entry_erase_pending (frame, this); - return 0; - } - - gf_log (this->name, GF_LOG_TRACE, - "impunging entries of %s on %s to other sinks", - local->loc.path, priv->children[active_src]->name); - - afr_sh_entry_impunge_subvol (frame, this, active_src); 
- - return 0; -} - - -int -afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - /* TODO: some of the open's might fail. - In that case, modify cleanup fn to send flush on those - fd's which are already open */ - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "opendir of %s failed on child %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - sh->op_failed = 1; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (sh->op_failed) { - afr_sh_entry_finish (frame, this); - return 0; - } - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened, commencing sync", - local->loc.path); - - sh->active_source = -1; - afr_sh_entry_expunge_all (frame, this); - } - - return 0; -} - - -int -afr_sh_entry_open (call_frame_t *frame, xlator_t *this) -{ - int i = 0; - int call_count = 0; - - int source = -1; - int *sources = NULL; - - fd_t *fd = NULL; - - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = local->self_heal.source; - sources = local->self_heal.sources; - - sh->block_size = 65536; //131072 - sh->offset = 0; - - call_count = sh->active_sinks; - if (source != -1) - call_count++; - - local->call_count = call_count; - - fd = fd_create (local->loc.inode, frame->root->pid); - sh->healing_fd = fd; - - if (source != -1) { - gf_log (this->name, GF_LOG_TRACE, - "opening directory %s on subvolume %s (source)", - local->loc.path, priv->children[source]->name); - - /* open source */ - STACK_WIND_COOKIE (frame, 
afr_sh_entry_opendir_cbk, - (void *) (long) source, - priv->children[source], - priv->children[source]->fops->opendir, - &local->loc, fd); - call_count--; - } - - /* open sinks */ - for (i = 0; i < priv->child_count; i++) { - if (sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "opening directory %s on subvolume %s (sink)", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->opendir, - &local->loc, fd); - - if (!--call_count) - break; - } - - return 0; + if (args.frame) + AFR_STACK_DESTROY(args.frame); + return ret; } - -int -afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int active_sinks = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; - } - } - if (source != -1) - sh->success[source] = 1; - - if (active_sinks == 0) { - gf_log (this->name, GF_LOG_TRACE, - "no active sinks for self-heal on dir %s", - local->loc.path); - afr_sh_entry_finish (frame, this); - return 0; - } - if (source == -1 && active_sinks < 2) { - gf_log (this->name, GF_LOG_TRACE, - "cannot sync with 0 sources and 1 sink on dir %s", - local->loc.path); - afr_sh_entry_finish (frame, this); - return 0; - } - sh->active_sinks = active_sinks; - - if (source != -1) - gf_log (this->name, GF_LOG_DEBUG, - "self-healing directory %s from subvolume %s to " - "%d other", - local->loc.path, priv->children[source]->name, - active_sinks); - else - gf_log (this->name, GF_LOG_DEBUG, - "no active sources for %s found. 
" - "merging all entries as a conservative decision", - local->loc.path); - - afr_sh_entry_open (frame, this); - - return 0; -} - - -int -afr_sh_entry_fix (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_entry_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source, + unsigned char *sources, unsigned char *healed_sinks) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - - int nsources = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (sh->forced_merge) { - sh->source = -1; - goto heal; - } - - afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr, - priv->child_count, AFR_ENTRY_TRANSACTION); - - afr_sh_print_pending_matrix (sh->pending_matrix, this); - - nsources = afr_sh_mark_sources (sh, priv->child_count, - AFR_SELF_HEAL_ENTRY); - - if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); + int i = 0; + int ret = 0; + gf_boolean_t mismatch = _gf_false; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, + "performing entry selfheal on %s", uuid_utoa(fd->inode->gfid)); + + for (i = 0; i < priv->child_count; i++) { + /* Expunge */ + if (!healed_sinks[i]) + continue; + + if (!local->need_full_crawl) + /* Why call afr_selfheal_entry_granular() on a "healed sink", + * given that it is the source that contains the granular + * indices? + * If the index for this directory is non-existent or empty on + * this subvol (=> clear sink), then it will return early + * without failure status. + * If the index is non-empty and it is yet a 'healed sink', then + * it is due to a split-brain in which case we anyway need to + * crawl the indices/entry-changes/pargfid directory. 
+ */ + ret = afr_selfheal_entry_granular(frame, this, fd, i, _gf_false); + else + ret = afr_selfheal_entry_do_subvol(frame, this, fd, i); - afr_sh_entry_finish (frame, this); - return 0; + if (ret == -1) { + /* gfid or type mismatch. */ + mismatch = _gf_true; + ret = 0; } + if (ret) + break; + } - afr_sh_supress_errenous_children (sh->sources, sh->child_errno, - priv->child_count); - - source = afr_sh_select_source (sh->sources, priv->child_count); - - sh->source = source; - -heal: - afr_sh_entry_sync_prepare (frame, this); - - return 0; -} - - - -int -afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - sh->xattr[child_index] = dict_ref (xattr); - sh->buf[child_index] = *buf; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_entry_fix (frame, this); - } - - return 0; + if (!ret && source != -1) { + /* Impunge */ + if (local->need_full_crawl) + ret = afr_selfheal_entry_do_subvol(frame, this, fd, source); + else + ret = afr_selfheal_entry_granular(frame, this, fd, source, + _gf_true); + } + + if (mismatch == _gf_true) + /* undo pending will be skipped */ + ret = -1; + return ret; } - - -int -afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this) +static int +__afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - dict_t *xattr_req = NULL; - int ret = 0; - int call_count = 0; - int i = 0; - - priv = this->private; - local = frame->local; - - call_count = afr_up_children_count (priv->child_count, - local->child_up); - - local->call_count = 
call_count; - - xattr_req = dict_new(); - if (xattr_req) { - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (xattr_req, - priv->pending_key[i], - 3 * sizeof(int32_t)); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - } + int ret = -1; + int source = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *postop_lock = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *undid_pending = NULL; + struct afr_reply *locked_replies = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_boolean_t did_sh = _gf_true; + + priv = this->private; + local = frame->local; + + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + healed_sinks = alloca0(priv->child_count); + undid_pending = alloca0(priv->child_count); + data_lock = alloca0(priv->child_count); + postop_lock = alloca0(priv->child_count); + + locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count); + + ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL, + data_lock); + { + if (ret < priv->child_count) { + gf_msg_debug(this->name, 0, + "%s: Skipping " + "entry self-heal as only %d sub-volumes could " + "be locked in %s domain", + uuid_utoa(fd->inode->gfid), ret, this->name); + ret = -ENOTCONN; + goto unlock; } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, - afr_sh_entry_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &local->loc, xattr_req); - if (!--call_count) - break; - } - } - - if (xattr_req) - dict_unref (xattr_req); - - return 0; -} - -int -afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - 
"Non Blocking entrylks failed."); - afr_sh_entry_done (frame, this); - } else { + ret = __afr_selfheal_entry_prepare(frame, this, fd->inode, data_lock, + sources, sinks, healed_sinks, + locked_replies, &source, NULL); + if (AFR_COUNT(healed_sinks, priv->child_count) == 0) { + did_sh = _gf_false; + goto unlock; + } - gf_log (this->name, GF_LOG_DEBUG, - "Non Blocking entrylks done. Proceeding to FOP"); - afr_sh_entry_lookup(frame, this); + local->need_full_crawl = afr_need_full_heal( + this, locked_replies, source, healed_sinks, AFR_ENTRY_TRANSACTION); + } +unlock: + afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, data_lock, + NULL); + if (ret < 0) + goto out; + + if (!did_sh) + goto out; + + ret = afr_selfheal_entry_do(frame, this, fd, source, sources, healed_sinks); + if (ret) + goto out; + + /* Take entrylks in xlator domain before doing post-op (undo-pending) in + * entry self-heal. This is to prevent a parallel name self-heal on + * an entry under @fd->inode from reading pending xattrs while it is + * being modified by SHD after entry sh below, given that + * name self-heal takes locks ONLY in xlator domain and is free to read + * pending changelog in the absence of the following locking. 
+ */ + ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL, + postop_lock); + { + if (AFR_CMP(data_lock, postop_lock, priv->child_count) != 0) { + gf_msg_debug(this->name, 0, + "%s: Skipping " + "post-op after entry self-heal as %d " + "sub-volumes, as opposed to %d, " + "could be locked in %s domain", + uuid_utoa(fd->inode->gfid), ret, + AFR_COUNT(data_lock, priv->child_count), this->name); + ret = -ENOTCONN; + goto postop_unlock; } - return 0; + afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks, + locked_replies); + ret = afr_selfheal_undo_pending( + frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending, + AFR_ENTRY_TRANSACTION, locked_replies, postop_lock); + } +postop_unlock: + afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, + postop_lock, NULL); +out: + if (did_sh) + afr_log_selfheal(fd->inode->gfid, this, ret, "entry", source, sources, + healed_sinks); + else + ret = 1; + + if (locked_replies) + afr_replies_wipe(locked_replies, priv->child_count); + return ret; } -int -afr_sh_entry_lock (call_frame_t *frame, xlator_t *this) +static fd_t * +afr_selfheal_data_opendir(xlator_t *this, inode_t *inode) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_ENTRY_SELF_HEAL_LK; - - afr_set_lock_number (frame, this); - - int_lock->lk_basename = NULL; - int_lock->lk_loc = &local->loc; - int_lock->lock_cbk = afr_sh_post_nonblocking_entry_cbk; - - afr_nonblocking_entrylk (frame, this); - - - return 0; + loc_t loc = { + 0, + }; + int ret = 0; + fd_t *fd = NULL; + + fd = fd_create(inode, 0); + if (!fd) + return NULL; + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + ret = syncop_opendir(this, &loc, fd, NULL, NULL); + if (ret) { + fd_unref(fd); + fd = NULL; + } else { + fd_bind(fd); + } + + loc_wipe(&loc); + return 
fd; } - int -afr_self_heal_entry (call_frame_t *frame, xlator_t *this) +afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + fd_t *fd = NULL; + int ret = 0; + + priv = this->private; + + fd = afr_selfheal_data_opendir(this, inode); + if (!fd) + return -EIO; + + locked_on = alloca0(priv->child_count); + + ret = afr_selfheal_tie_breaker_entrylk(frame, this, inode, priv->sh_domain, + NULL, locked_on); + { + if (ret < priv->child_count) { + gf_msg_debug(this->name, 0, + "%s: Skipping " + "entry self-heal as only %d sub-volumes could " + "be locked in %s domain", + uuid_utoa(fd->inode->gfid), ret, priv->sh_domain); + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. + */ + ret = -ENOTCONN; + goto unlock; + } - priv = this->private; - local = frame->local; + ret = __afr_selfheal_entry(frame, this, fd, locked_on); + } +unlock: + afr_selfheal_unentrylk(frame, this, inode, priv->sh_domain, NULL, locked_on, + NULL); - if (local->self_heal.need_entry_self_heal && priv->entry_self_heal) { - afr_sh_entry_lock (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to completion on %s", - local->loc.path); - afr_sh_entry_done (frame, this); - } + if (fd) + fd_unref(fd); - return 0; + return ret; } - diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index e76d58850cd..03f43bad16e 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -1,731 +1,546 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ -#include "glusterfs.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" +#include <glusterfs/byte-order.h> +#include "protocol-common.h" +#include <glusterfs/events.h> +#define AFR_HEAL_ATTR (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE) -int -afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) +static gf_boolean_t +_afr_ignorable_key_match(dict_t *d, char *k, data_t *val, void *mdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - -// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); - memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count); - memset (sh->success, 0, sizeof (int) * priv->child_count); - -/* for (i = 0; i < priv->child_count; i++) { */ -/* sh->locked_nodes[i] = 1; */ -/* } */ - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - - if (local->govinda_gOvinda) { - gf_log (this->name, GF_LOG_DEBUG, - "aborting selfheal of %s", - local->loc.path); - sh->completion_cbk (frame, this); - } else { - if (IA_ISREG (sh->type)) { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to data check on %s", - local->loc.path); - afr_self_heal_data (frame, this); - return 0; - } - - if (IA_ISDIR (sh->type)) { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to entry check on %s", - local->loc.path); - afr_self_heal_entry (frame, this); - return 0; - } - gf_log (this->name, GF_LOG_DEBUG, - "completed self heal of %s", - local->loc.path); - - sh->completion_cbk (frame, this); - } - - return 0; + return 
afr_is_xattr_ignorable(k); } - -int -afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +void +afr_delete_ignorable_xattrs(dict_t *xattr) { - int call_count = 0; - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_metadata_done (frame, this); - - return 0; + dict_foreach_match(xattr, _afr_ignorable_key_match, NULL, + dict_remove_foreach_fn, NULL); } int -afr_sh_inode_unlock (call_frame_t *frame, xlator_t *this) +__afr_selfheal_metadata_do(call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, unsigned char *healed_sinks, + struct afr_reply *locked_replies) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->lock_cbk = afr_sh_metadata_done; - afr_unlock (frame, this); - - return 0; -} - -int -afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) -{ - afr_sh_inode_unlock (frame, this); - - return 0; -} - - -int -afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) -{ - afr_local_t *local = NULL; - int call_count = 0; - - local = frame->local; - - LOCK (&frame->lock); - { - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); + int ret = -1; + loc_t loc = { + 0, + }; + dict_t *xattr = NULL; + dict_t *old_xattr = NULL; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, + "performing metadata selfheal on %s", uuid_utoa(inode->gfid)); + + ret = syncop_getxattr(priv->children[source], &loc, &xattr, NULL, NULL, + NULL); + if (ret < 0) { + ret = -EIO; + goto out; + } + + afr_delete_ignorable_xattrs(xattr); + + for (i = 0; i < priv->child_count; i++) { + if (old_xattr) { + dict_unref(old_xattr); + old_xattr = NULL; + } - if 
(call_count == 0) - afr_sh_metadata_finish (frame, this); + if (!healed_sinks[i]) + continue; + + ret = syncop_setattr(priv->children[i], &loc, + &locked_replies[source].poststat, AFR_HEAL_ATTR, + NULL, NULL, NULL, NULL); + if (ret) + healed_sinks[i] = 0; + + ret = syncop_getxattr(priv->children[i], &loc, &old_xattr, 0, NULL, + NULL); + if (old_xattr) { + afr_delete_ignorable_xattrs(old_xattr); + ret = syncop_removexattr(priv->children[i], &loc, "", old_xattr, + NULL); + if (ret) + healed_sinks[i] = 0; + } - return 0; -} + ret = syncop_setxattr(priv->children[i], &loc, xattr, 0, NULL, NULL); + if (ret) + healed_sinks[i] = 0; + } + ret = 0; +out: + loc_wipe(&loc); + if (xattr) + dict_unref(xattr); + if (old_xattr) + dict_unref(old_xattr); -int -afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, - sh->success, priv->child_count, - AFR_METADATA_TRANSACTION); - - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, - gf_afr_mt_dict_t); - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } - } - - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_METADATA_TRANSACTION); - - local->call_count = call_count; - - if (call_count == 0) { - gf_log (this->name, GF_LOG_WARNING, - "metadata of %s not healed on any subvolume", - local->loc.path); - - afr_sh_metadata_finish (frame, this); - } - - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, 
afr_sh_metadata_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } - - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - GF_FREE (erase_xattr); - - return 0; + return ret; } - -int -afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +static uint64_t +mtime_ns(struct iatt *ia) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "setting attributes failed for %s on %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->success[child_index] = 0; - } - } - UNLOCK (&frame->lock); + uint64_t ret; - call_count = afr_frame_return (frame); + ret = (((uint64_t)(ia->ia_mtime)) * 1000000000) + + (uint64_t)(ia->ia_mtime_nsec); - if (call_count == 0) - afr_sh_metadata_erase_pending (frame, this); - - return 0; + return ret; } - -int -afr_sh_metadata_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) +/* + * When directory content is modified, [mc]time is updated. On + * Linux, the filesystem does it, while at least on NetBSD, the + * kernel file-system independent code does it. This means that + * when entries are added while bricks are down, the kernel sends + * a SETATTR [mc]time which will cause metadata split brain for + * the directory. In this case, clear the split brain by finding + * the source with the most recent modification date. 
+ */ +static int +afr_dirtime_splitbrain_source(call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + unsigned char *locked_on) { - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); - - return 0; + afr_private_t *priv = NULL; + int source = -1; + struct iatt source_ia; + struct iatt child_ia; + uint64_t mtime = 0; + int i; + int ret = -1; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!locked_on[i]) + continue; + + if (!replies[i].valid) + continue; + + if (replies[i].op_ret != 0) + continue; + + if (mtime_ns(&replies[i].poststat) <= mtime) + continue; + + mtime = mtime_ns(&replies[i].poststat); + source = i; + } + + if (source == -1) + goto out; + + source_ia = replies[source].poststat; + if (source_ia.ia_type != IA_IFDIR) + goto out; + + for (i = 0; i < priv->child_count; i++) { + if (i == source) + continue; + + if (!replies[i].valid) + continue; + + if (replies[i].op_ret != 0) + continue; + + child_ia = replies[i].poststat; + + if (!IA_EQUAL(source_ia, child_ia, gfid) || + !IA_EQUAL(source_ia, child_ia, type) || + !IA_EQUAL(source_ia, child_ia, prot) || + !IA_EQUAL(source_ia, child_ia, uid) || + !IA_EQUAL(source_ia, child_ia, gid) || + !afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata)) + goto out; + } + + /* + * Metadata split brain is just about [amc]time + * We return our source. 
+ */ + ret = source; +out: + return ret; } - -int -afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +static int +__afr_selfheal_metadata_mark_pending_xattrs(call_frame_t *frame, xlator_t *this, + inode_t *inode, + struct afr_reply *replies, + unsigned char *sources) { - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); - - return 0; -} + int ret = 0; + int i = 0; + int m_idx = 0; + afr_private_t *priv = NULL; + int raw[AFR_NUM_CHANGE_LOGS] = {0}; + dict_t *xattr = NULL; + + priv = this->private; + m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION); + raw[m_idx] = 1; + + xattr = dict_new(); + if (!xattr) + return -ENOMEM; + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) + continue; + ret = dict_set_static_bin(xattr, priv->pending_key[i], raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + ret = -1; + goto out; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + ret = afr_selfheal_post_op(frame, this, inode, i, xattr, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_SELF_HEAL_INFO, + "Failed to set pending metadata xattr on child %d for %s", i, + uuid_utoa(inode->gfid)); + goto out; + } + } + afr_replies_wipe(replies, priv->child_count); + ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); -int -afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - int active_sinks = 0; - int call_count = 0; - int i = 0; - - struct iatt stbuf; - int32_t valid = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - active_sinks = sh->active_sinks; - - /* - * 2 calls per sink - setattr, setxattr - */ - if (xattr) - call_count = active_sinks * 2; - else - call_count = active_sinks; - - local->call_count = 
call_count; - - stbuf.ia_atime = sh->buf[source].ia_atime; - stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; - stbuf.ia_mtime = sh->buf[source].ia_mtime; - stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; - - stbuf.ia_uid = sh->buf[source].ia_uid; - stbuf.ia_gid = sh->buf[source].ia_gid; - - stbuf.ia_type = sh->buf[source].ia_type; - stbuf.ia_prot = sh->buf[source].ia_prot; - - valid = GF_SET_ATTR_MODE | - GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - - for (i = 0; i < priv->child_count; i++) { - if (call_count == 0) { - break; - } - if (sh->sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_DEBUG, - "self-healing metadata of %s from %s to %s", - local->loc.path, priv->children[source]->name, - priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_setattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, &stbuf, valid); - - call_count--; - - if (!xattr) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, xattr, 0); - call_count--; - } - - return 0; +out: + if (xattr) + dict_unref(xattr); + return ret; } - -int -afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) +/* + * Look for mismatching uid/gid or mode or user xattrs even if + * AFR xattrs don't say so, and pick one arbitrarily as winner. 
*/ + +static int +__afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this, + inode_t *inode, unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *undid_pending, + unsigned char *locked_on, + struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - - int i; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "getxattr of %s failed on subvolume %s (%s). proceeding without xattr", - local->loc.path, priv->children[source]->name, - strerror (op_errno)); - - afr_sh_metadata_sync (frame, this, NULL); - } else { - for (i = 0; i < priv->child_count; i++) { - dict_del (xattr, priv->pending_key[i]); - } - - afr_sh_metadata_sync (frame, this, xattr); - } + int i = 0; + afr_private_t *priv = NULL; + struct iatt srcstat = { + 0, + }; + int source = -1; + int sources_count = 0; + int ret = 0; + + priv = this->private; + + sources_count = AFR_COUNT(sources, priv->child_count); + + if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) || + !sources_count) { + source = afr_mark_split_brain_source_sinks( + frame, this, inode, sources, sinks, healed_sinks, locked_on, + replies, AFR_METADATA_TRANSACTION); + if (source >= 0) { + _afr_fav_child_reset_sink_xattrs( + frame, this, inode, source, healed_sinks, undid_pending, + AFR_METADATA_TRANSACTION, locked_on, replies); + goto out; + } - return 0; -} + /* If this is a directory mtime/ctime only split brain + use the most recent */ + source = afr_dirtime_splitbrain_source(frame, this, replies, locked_on); + if (source != -1) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SPLIT_BRAIN, + "clear time " + "split brain on %s", + uuid_utoa(replies[source].poststat.ia_gfid)); + sources[source] = 1; + healed_sinks[source] = 0; + goto out; + } + if (!priv->metadata_splitbrain_forced_heal) 
{ + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;" + "type=metadata;file=%s", + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(inode->gfid)); + return -EIO; + } -int -afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int active_sinks = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; - } - } - sh->success[source] = 1; - - if (active_sinks == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "no active sinks for performing self-heal on file %s", - local->loc.path); - afr_sh_metadata_finish (frame, this); - return 0; - } - sh->active_sinks = active_sinks; - - gf_log (this->name, GF_LOG_TRACE, - "syncing metadata of %s from subvolume %s to %d active sinks", - local->loc.path, priv->children[source]->name, active_sinks); - - STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, - priv->children[source], - priv->children[source]->fops->getxattr, - &local->loc, NULL); - - return 0; + /* Metadata split brain, select one subvol + arbitrarily */ + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i] && healed_sinks[i]) { + sources[i] = 1; + healed_sinks[i] = 0; + break; + } + } + } + + /* No split brain at this point. 
If we were called from + * afr_heal_splitbrain_file(), abort.*/ + if (afr_dict_contains_heal_op(frame)) + return -EIO; + + source = afr_choose_source_by_policy(priv, sources, + AFR_METADATA_TRANSACTION); + srcstat = replies[source].poststat; + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i] || i == source) + continue; + if (!IA_EQUAL(srcstat, replies[i].poststat, type) || + !IA_EQUAL(srcstat, replies[i].poststat, uid) || + !IA_EQUAL(srcstat, replies[i].poststat, gid) || + !IA_EQUAL(srcstat, replies[i].poststat, prot)) { + gf_msg_debug(this->name, 0, + "%s: iatt mismatch " + "for source(%d) vs (%d)", + uuid_utoa(replies[source].poststat.ia_gfid), source, + i); + sources[i] = 0; + healed_sinks[i] = 1; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i] || i == source) + continue; + if (!afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata)) { + gf_msg_debug(this->name, 0, + "%s: xattr mismatch " + "for source(%d) vs (%d)", + uuid_utoa(replies[source].poststat.ia_gfid), source, + i); + sources[i] = 0; + healed_sinks[i] = 1; + } + } + if ((sources_count == priv->child_count) && (source > -1) && + (AFR_COUNT(healed_sinks, priv->child_count) != 0)) { + ret = __afr_selfheal_metadata_mark_pending_xattrs(frame, this, inode, + replies, sources); + if (ret < 0) + return ret; + } +out: + afr_mark_active_sinks(this, sources, locked_on, healed_sinks); + return source; } - int -afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) +__afr_selfheal_metadata_prepare(call_frame_t *frame, xlator_t *this, + inode_t *inode, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *undid_pending, + struct afr_reply *replies, unsigned char *pflag) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - 
afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr, - priv->child_count, - AFR_METADATA_TRANSACTION); - - afr_sh_print_pending_matrix (sh->pending_matrix, this); - - nsources = afr_sh_mark_sources (sh, priv->child_count, - AFR_SELF_HEAL_METADATA); - - afr_sh_supress_errenous_children (sh->sources, sh->child_errno, - priv->child_count); - - if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_metadata_finish (frame, this); - return 0; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; + uint64_t *witness = NULL; + + priv = this->private; + + ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); + if (ret) + return ret; + + witness = alloca0(sizeof(*witness) * priv->child_count); + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_METADATA_TRANSACTION, locked_on, + sources, sinks, witness, pflag); + if (ret) + return ret; + + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. 
+ */ + AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count); + + /* If any source has witness, pick first + * witness source and make everybody else sinks */ + for (i = 0; i < priv->child_count; i++) { + if (sources[i] && witness[i]) { + source = i; + break; } - - if ((nsources == -1) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { - - gf_log (this->name, GF_LOG_WARNING, - "Picking favorite child %s as authentic source to resolve conflicting metadata of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); - - sh->sources[priv->favorite_child] = 1; - - nsources = afr_sh_source_count (sh->sources, - priv->child_count); - } - - if (nsources == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to self-heal permissions/ownership of '%s' " - "(possible split-brain). Please fix the file on " - "all backend volumes", local->loc.path); - - local->govinda_gOvinda = 1; - - afr_sh_metadata_finish (frame, this); - return 0; - } - - source = afr_sh_select_source (sh->sources, priv->child_count); - - if (source == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "No active sources found."); - - afr_sh_metadata_finish (frame, this); - return 0; + } + + if (source != -1) { + for (i = 0; i < priv->child_count; i++) { + if (i != source && sources[i]) { + sources[i] = 0; + healed_sinks[i] = 1; + } } + } - sh->source = source; - - /* detect changes not visible through pending flags -- JIC */ - for (i = 0; i < priv->child_count; i++) { - if (i == source || sh->child_errno[i]) - continue; + source = __afr_selfheal_metadata_finalize_source( + frame, this, inode, sources, sinks, healed_sinks, undid_pending, + locked_on, replies); - if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; + if (source < 0) + return -EIO; - if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - } - - afr_sh_metadata_sync_prepare (frame, this); - - return 0; -} - - -int -afr_sh_metadata_lookup_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "path %s on subvolume %s is of mode 0%o", - local->loc.path, - priv->children[child_index]->name, - buf->ia_type); - - sh->buf[child_index] = *buf; - if (xattr) - sh->xattr[child_index] = dict_ref (xattr); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "path %s on subvolume %s => -1 (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->child_errno[child_index] = op_errno; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_metadata_fix (frame, this); - - return 0; + return source; } - int -afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this) +afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = 0; - dict_t *xattr_req = NULL; - int ret = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, - local->child_up); - local->call_count = call_count; - - xattr_req = dict_new(); - - if (xattr_req) { - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (xattr_req, - priv->pending_key[i], - 3 * sizeof(int32_t)); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - } + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + unsigned 
char *undid_pending = NULL; + struct afr_reply *locked_replies = NULL; + gf_boolean_t did_sh = _gf_true; + int source = -1; + + priv = this->private; + + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + healed_sinks = alloca0(priv->child_count); + undid_pending = alloca0(priv->child_count); + data_lock = alloca0(priv->child_count); + + locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count); + + ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0, + data_lock); + { + if (ret < priv->child_count) { + ret = -ENOTCONN; + goto unlock; } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &local->loc, xattr_req); - if (!--call_count) - break; - } - } - - if (xattr_req) - dict_unref (xattr_req); - - return 0; -} - -int -afr_sh_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + ret = __afr_selfheal_metadata_prepare( + frame, this, inode, data_lock, sources, sinks, healed_sinks, + undid_pending, locked_replies, NULL); + if (ret < 0) + goto unlock; - local = frame->local; - int_lock = &local->internal_lock; + source = ret; - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Non Blocking inodelks failed."); - afr_sh_metadata_done (frame, this); - } else { - - gf_log (this->name, GF_LOG_DEBUG, - "Non Blocking inodelks done. 
Proceeding to FOP"); - afr_sh_metadata_lookup (frame, this); + if (AFR_COUNT(healed_sinks, priv->child_count) == 0) { + did_sh = _gf_false; + goto unlock; } - return 0; + ret = __afr_selfheal_metadata_do(frame, this, inode, source, + healed_sinks, locked_replies); + if (ret) + goto unlock; + + afr_selfheal_restore_time(frame, this, inode, source, healed_sinks, + locked_replies); + + ret = afr_selfheal_undo_pending( + frame, this, inode, sources, sinks, healed_sinks, undid_pending, + AFR_METADATA_TRANSACTION, locked_replies, data_lock); + } +unlock: + afr_selfheal_uninodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0, + data_lock); + + if (did_sh) + afr_log_selfheal(inode->gfid, this, ret, "metadata", source, sources, + healed_sinks); + else + ret = 1; + + if (locked_replies) + afr_replies_wipe(locked_replies, priv->child_count); + return ret; } int -afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) +afr_selfheal_metadata_by_stbuf(xlator_t *this, struct iatt *stbuf) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK; - - afr_set_lock_number (frame, this); - - int_lock->lk_flock.l_start = 0; - int_lock->lk_flock.l_len = 0; - int_lock->lk_flock.l_type = F_WRLCK; - int_lock->lock_cbk = afr_sh_post_nonblocking_inodelk_cbk; - - afr_nonblocking_inodelk (frame, this); - - return 0; -} - - -int -afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = this->private; - - - local = frame->local; - - if (local->self_heal.need_metadata_self_heal && priv->metadata_self_heal) { - afr_sh_metadata_lock (frame, this); - } else { - afr_sh_metadata_done (frame, this); - } - - return 0; + inode_t *inode = NULL; + inode_t *link_inode = NULL; + call_frame_t *frame = NULL; + int ret = 0; + + if (gf_uuid_is_null(stbuf->ia_gfid)) 
{ + ret = -EINVAL; + goto out; + } + + inode = inode_new(this->itable); + if (!inode) { + ret = -ENOMEM; + goto out; + } + + link_inode = inode_link(inode, NULL, NULL, stbuf); + if (!link_inode) { + ret = -ENOMEM; + goto out; + } + + frame = afr_frame_create(this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + + ret = afr_selfheal_metadata(frame, this, link_inode); +out: + if (inode) + inode_unref(inode); + if (link_inode) + inode_unref(link_inode); + if (frame) + AFR_STACK_DESTROY(frame); + return ret; } - diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c new file mode 100644 index 00000000000..834aac86d48 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -0,0 +1,616 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <glusterfs/events.h> +#include "afr.h" +#include "afr-self-heal.h" +#include "afr-messages.h" + +int +__afr_selfheal_assign_gfid(xlator_t *this, inode_t *parent, uuid_t pargfid, + const char *bname, inode_t *inode, + struct afr_reply *replies, void *gfid, + unsigned char *locked_on, int source, + unsigned char *sources, gf_boolean_t is_gfid_absent, + int *gfid_idx) +{ + int ret = 0; + int up_count = 0; + int locked_count = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + gf_uuid_copy(parent->gfid, pargfid); + + if (is_gfid_absent) { + /* Ensure all children of AFR are up before performing gfid heal, to + * guard against the possibility of gfid split brain. 
*/ + + up_count = AFR_COUNT(priv->child_up, priv->child_count); + if (up_count != priv->child_count) { + ret = -EIO; + goto out; + } + + locked_count = AFR_COUNT(locked_on, priv->child_count); + if (locked_count != priv->child_count) { + ret = -EIO; + goto out; + } + } + + ret = afr_lookup_and_heal_gfid(this, parent, bname, inode, replies, source, + sources, gfid, gfid_idx); + +out: + return ret; +} + +int +__afr_selfheal_name_impunge(call_frame_t *frame, xlator_t *this, + inode_t *parent, uuid_t pargfid, const char *bname, + inode_t *inode, struct afr_reply *replies, + int gfid_idx) +{ + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; + unsigned char *sources = NULL; + + priv = this->private; + + sources = alloca0(priv->child_count); + + gf_uuid_copy(parent->gfid, pargfid); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; + + if (gf_uuid_compare(replies[i].poststat.ia_gfid, + replies[gfid_idx].poststat.ia_gfid) == 0) { + sources[i] = 1; + continue; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) + continue; + + ret |= afr_selfheal_recreate_entry(frame, i, gfid_idx, sources, parent, + bname, inode, replies); + } + + return ret; +} + +int +__afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid, + const char *bname, inode_t *inode, + struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret) + continue; + + ret |= afr_selfheal_entry_delete(this, parent, bname, inode, i, + replies); + } + + return ret; +} + +static gf_boolean_t +afr_selfheal_name_need_heal_check(xlator_t *this, struct afr_reply *replies) +{ + int i = 0; + int first_idx = -1; + gf_boolean_t need_heal = _gf_false; + afr_private_t *priv = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if 
(!replies[i].valid) + continue; + + if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA)) + need_heal = _gf_true; + + if (first_idx == -1) { + first_idx = i; + continue; + } + + if (replies[i].op_ret != replies[first_idx].op_ret) + need_heal = _gf_true; + + if (gf_uuid_compare(replies[i].poststat.ia_gfid, + replies[first_idx].poststat.ia_gfid)) + need_heal = _gf_true; + + if ((replies[i].op_ret == 0) && + (gf_uuid_is_null(replies[i].poststat.ia_gfid))) + need_heal = _gf_true; + } + + return need_heal; +} + +static int +afr_selfheal_name_type_mismatch_check(xlator_t *this, struct afr_reply *replies, + int source, unsigned char *sources, + uuid_t pargfid, const char *bname) +{ + int i = 0; + int type_idx = -1; + ia_type_t inode_type = IA_INVAL; + ia_type_t inode_type1 = IA_INVAL; + afr_private_t *priv = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; + + if (replies[i].poststat.ia_type == IA_INVAL) + continue; + + if (inode_type == IA_INVAL) { + inode_type = replies[i].poststat.ia_type; + type_idx = i; + continue; + } + inode_type1 = replies[i].poststat.ia_type; + if (sources[i] || source == -1) { + if ((sources[type_idx] || source == -1) && + (inode_type != inode_type1)) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN, + "Type mismatch for <gfid:%s>/%s: " + "%s on %s and %s on %s", + uuid_utoa(pargfid), bname, + gf_inode_type_to_str(inode_type1), + priv->children[i]->name, + gf_inode_type_to_str(inode_type), + priv->children[type_idx]->name); + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;type=file;" + "file=<gfid:%s>/%s;count=2;" + "child-%d=%s;type-%d=%s;child-%d=%s;" + "type-%d=%s", + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(pargfid), bname, i, priv->children[i]->name, + i, gf_inode_type_to_str(inode_type1), type_idx, + priv->children[type_idx]->name, type_idx, + gf_inode_type_to_str(inode_type)); + return -EIO; 
+ } + inode_type = replies[i].poststat.ia_type; + type_idx = i; + } + } + return 0; +} + +static int +afr_selfheal_name_gfid_mismatch_check(xlator_t *this, struct afr_reply *replies, + int source, unsigned char *sources, + int *gfid_idx, uuid_t pargfid, + const char *bname, inode_t *inode, + unsigned char *locked_on, dict_t *xdata) +{ + int i = 0; + int gfid_idx_iter = -1; + int ret = -1; + void *gfid = NULL; + void *gfid1 = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; + + if (gf_uuid_is_null(replies[i].poststat.ia_gfid)) + continue; + + if (!gfid) { + gfid = &replies[i].poststat.ia_gfid; + gfid_idx_iter = i; + continue; + } + + gfid1 = &replies[i].poststat.ia_gfid; + if (sources[i] || source == -1) { + if ((sources[gfid_idx_iter] || source == -1) && + gf_uuid_compare(gfid, gfid1)) { + ret = afr_gfid_split_brain_source(this, replies, inode, pargfid, + bname, gfid_idx_iter, i, + locked_on, gfid_idx, xdata); + if (!ret && *gfid_idx >= 0) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + "GFID split-brain resolved"); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_DICT_SET_FAILED, + "Error setting gfid-" + "heal-msg dict"); + } + return ret; + } + gfid = &replies[i].poststat.ia_gfid; + gfid_idx_iter = i; + } + } + + *gfid_idx = gfid_idx_iter; + return 0; +} + +static gf_boolean_t +afr_selfheal_name_source_empty_check(xlator_t *this, struct afr_reply *replies, + unsigned char *sources, int source) +{ + int i = 0; + afr_private_t *priv = NULL; + gf_boolean_t source_is_empty = _gf_true; + + priv = this->private; + + if (source == -1) { + source_is_empty = _gf_false; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + + if (replies[i].op_ret == -1 && replies[i].op_errno == ENOENT) + continue; + + source_is_empty = _gf_false; + break; + } +out: + return source_is_empty; +} + +int 
+__afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, int source, + unsigned char *locked_on, struct afr_reply *replies, + void *gfid_req, dict_t *xdata) +{ + int gfid_idx = -1; + int ret = -1; + void *gfid = NULL; + gf_boolean_t source_is_empty = _gf_true; + gf_boolean_t need_heal = _gf_false; + gf_boolean_t is_gfid_absent = _gf_false; + + need_heal = afr_selfheal_name_need_heal_check(this, replies); + if (!need_heal) + return 0; + + source_is_empty = afr_selfheal_name_source_empty_check(this, replies, + sources, source); + if (source_is_empty) { + ret = __afr_selfheal_name_expunge(this, parent, pargfid, bname, inode, + replies); + if (ret == -EIO) + ret = -1; + return ret; + } + + ret = afr_selfheal_name_type_mismatch_check(this, replies, source, sources, + pargfid, bname); + if (ret) + return ret; + + ret = afr_selfheal_name_gfid_mismatch_check(this, replies, source, sources, + &gfid_idx, pargfid, bname, + inode, locked_on, xdata); + if (ret) + return ret; + + if (gfid_idx == -1) { + if (!gfid_req || gf_uuid_is_null(gfid_req)) + return -1; + gfid = gfid_req; + } else { + gfid = &replies[gfid_idx].poststat.ia_gfid; + if (source == -1) + /* Either entry split-brain or dirty xattrs are present on parent.*/ + source = gfid_idx; + } + + is_gfid_absent = (gfid_idx == -1) ? 
_gf_true : _gf_false; + ret = __afr_selfheal_assign_gfid(this, parent, pargfid, bname, inode, + replies, gfid, locked_on, source, sources, + is_gfid_absent, &gfid_idx); + if (ret || (gfid_idx < 0)) + return ret; + + ret = __afr_selfheal_name_impunge(frame, this, parent, pargfid, bname, + inode, replies, gfid_idx); + if (ret == -EIO) + ret = -1; + + return ret; +} + +int +__afr_selfheal_name_finalize_source(xlator_t *this, unsigned char *sources, + unsigned char *healed_sinks, + unsigned char *locked_on, uint64_t *witness) +{ + int i = 0; + afr_private_t *priv = NULL; + int source = -1; + int sources_count = 0; + + priv = this->private; + + sources_count = AFR_COUNT(sources, priv->child_count); + + if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) || + !sources_count || afr_does_witness_exist(this, witness)) { + memset(sources, 0, sizeof(*sources) * priv->child_count); + afr_mark_active_sinks(this, sources, locked_on, healed_sinks); + return -1; + } + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + break; + } + } + + return source; +} + +int +__afr_selfheal_name_prepare(call_frame_t *frame, xlator_t *this, + inode_t *parent, uuid_t pargfid, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + int *source_p) +{ + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + uint64_t *witness = NULL; + + priv = this->private; + + replies = alloca0(priv->child_count * sizeof(*replies)); + + ret = afr_selfheal_unlocked_discover(frame, parent, pargfid, replies); + if (ret) + goto out; + + witness = alloca0(sizeof(*witness) * priv->child_count); + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_ENTRY_TRANSACTION, locked_on, sources, + sinks, witness, NULL); + if (ret) + goto out; + + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are 
up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count); + + source = __afr_selfheal_name_finalize_source(this, sources, healed_sinks, + locked_on, witness); + if (source < 0) { + /* If source is < 0 (typically split-brain), we perform a + conservative merge of entries rather than erroring out */ + } + *source_p = source; + +out: + if (replies) + afr_replies_wipe(replies, priv->child_count); + + return ret; +} + +int +afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, void *gfid_req, + dict_t *xdata) +{ + afr_private_t *priv = NULL; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *locked_on = NULL; + int source = -1; + struct afr_reply *replies = NULL; + int ret = -1; + inode_t *inode = NULL; + dict_t *xattr = NULL; + + xattr = dict_new(); + if (!xattr) + return -ENOMEM; + + ret = dict_set_int32_sizen(xattr, GF_GFIDLESS_LOOKUP, 1); + if (ret) { + dict_unref(xattr); + return -1; + } + + priv = this->private; + + locked_on = alloca0(priv->child_count); + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + healed_sinks = alloca0(priv->child_count); + + replies = alloca0(priv->child_count * sizeof(*replies)); + + ret = afr_selfheal_entrylk(frame, this, parent, this->name, bname, + locked_on); + { + if (ret < priv->child_count) { + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_name_prepare(frame, this, parent, pargfid, + locked_on, sources, sinks, + healed_sinks, &source); + if (ret) + goto unlock; + + inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies, + locked_on, xattr); + if (!inode) { + ret = -ENOMEM; + goto unlock; + } + + ret = __afr_selfheal_name_do(frame, this, parent, pargfid, bname, inode, + sources, sinks, 
healed_sinks, source, + locked_on, replies, gfid_req, xdata); + } +unlock: + afr_selfheal_unentrylk(frame, this, parent, this->name, bname, locked_on, + NULL); + if (inode) + inode_unref(inode); + + if (replies) + afr_replies_wipe(replies, priv->child_count); + if (xattr) + dict_unref(xattr); + + return ret; +} + +int +afr_selfheal_name_unlocked_inspect(call_frame_t *frame, xlator_t *this, + inode_t *parent, uuid_t pargfid, + const char *bname, gf_boolean_t *need_heal) +{ + afr_private_t *priv = NULL; + int i = 0; + struct afr_reply *replies = NULL; + inode_t *inode = NULL; + int first_idx = -1; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + replies = alloca0(sizeof(*replies) * priv->child_count); + + inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies, + local->child_up, NULL); + if (!inode) + return -ENOMEM; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA)) { + *need_heal = _gf_true; + break; + } + + if (first_idx == -1) { + first_idx = i; + continue; + } + + if (replies[i].op_ret != replies[first_idx].op_ret) { + *need_heal = _gf_true; + break; + } + + if (gf_uuid_compare(replies[i].poststat.ia_gfid, + replies[first_idx].poststat.ia_gfid)) { + *need_heal = _gf_true; + break; + } + } + + if (inode) + inode_unref(inode); + if (replies) + afr_replies_wipe(replies, priv->child_count); + return 0; +} + +int +afr_selfheal_name(xlator_t *this, uuid_t pargfid, const char *bname, + void *gfid_req, dict_t *xdata) +{ + inode_t *parent = NULL; + call_frame_t *frame = NULL; + int ret = -1; + gf_boolean_t need_heal = _gf_false; + + parent = afr_inode_find(this, pargfid); + if (!parent) + goto out; + + frame = afr_frame_create(this, NULL); + if (!frame) + goto out; + + ret = afr_selfheal_name_unlocked_inspect(frame, this, parent, pargfid, + bname, &need_heal); + if (ret) + goto out; + + if (need_heal) { + ret = 
afr_selfheal_name_do(frame, this, parent, pargfid, bname, + gfid_req, xdata); + if (ret) + goto out; + } + + ret = 0; +out: + if (parent) + inode_unref(parent); + if (frame) + AFR_STACK_DESTROY(frame); + + return ret; +} diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index b10ae3fc037..48e6dbcfb18 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -1,54 +1,377 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#ifndef __AFR_SELF_HEAL_H__ -#define __AFR_SELF_HEAL_H__ +#ifndef _AFR_SELFHEAL_H +#define _AFR_SELFHEAL_H + +/* Perform fop on all UP subvolumes and wait for all callbacks to return */ + +#define AFR_ONALL(frame, rfn, fop, args...) 
\ + do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0, __count = 0; \ + unsigned char *__child_up = alloca(__priv->child_count); \ + \ + memcpy(__child_up, __priv->child_up, \ + sizeof(*__child_up) * __priv->child_count); \ + __count = AFR_COUNT(__child_up, __priv->child_count); \ + \ + __local->barrier.waitfor = __count; \ + afr_local_replies_wipe(__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!__child_up[__i]) \ + continue; \ + STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + } \ + syncbarrier_wait(&__local->barrier, __count); \ + } while (0) + +/* Perform fop on all subvolumes represented by list[] array and wait + for all callbacks to return */ + +#define AFR_ONLIST(list, frame, rfn, fop, args...) \ + do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0; \ + int __count = 0; \ + unsigned char *__list = alloca(__priv->child_count); \ + \ + memcpy(__list, list, sizeof(*__list) * __priv->child_count); \ + __count = AFR_COUNT(__list, __priv->child_count); \ + __local->barrier.waitfor = __count; \ + afr_local_replies_wipe(__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!__list[__i]) \ + continue; \ + STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + } \ + syncbarrier_wait(&__local->barrier, __count); \ + } while (0) + +#define AFR_SEQ(frame, rfn, fop, args...) 
\ + do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0; \ + \ + afr_local_replies_wipe(__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!__priv->child_up[__i]) \ + continue; \ + STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + syncbarrier_wait(&__local->barrier, 1); \ + } \ + } while (0) + +#define ALLOC_MATRIX(n, type) \ + ({ \ + int __i; \ + type **__ptr = alloca(n * sizeof(type *)); \ + \ + for (__i = 0; __i < n; __i++) \ + __ptr[__i] = alloca0(n * sizeof(type)); \ + __ptr; \ + }) + +#define IA_EQUAL(f, s, field) \ + (memcmp(&(f.ia_##field), &(s.ia_##field), sizeof(s.ia_##field)) == 0) + +#define SBRAIN_HEAL_NO_GO_MSG \ + "Failed to obtain replies from all bricks of " \ + "the replica (are they up?). Cannot resolve split-brain." +#define SFILE_NOT_IN_SPLIT_BRAIN "File not in split-brain" +#define SNO_BIGGER_FILE "No bigger file" +#define SNO_DIFF_IN_MTIME "No difference in mtime" +#define SUSE_SOURCE_BRICK_TO_HEAL \ + "Use source-brick option to heal metadata" \ + " split-brain" +#define SINVALID_BRICK_NAME "Invalid brick name" +#define SBRICK_IS_NOT_UP "Brick is not up" +#define SBRICK_NOT_CONNECTED "Brick is not connected" +#define SLESS_THAN2_BRICKS_in_REP "< 2 bricks in replica are up" +#define SBRICK_IS_REMOTE "Brick is remote" +#define SSTARTED_SELF_HEAL "Started self-heal" +#define SOP_NOT_SUPPORTED "Operation Not Supported" +#define SFILE_NOT_UNDER_DATA \ + "The file is not under data or metadata " \ + "split-brain" +#define SFILE_NOT_IN_SPLIT_BRAIN "File not in split-brain" +#define SALL_BRICKS_UP_TO_RESOLVE \ + "All the bricks should be up to resolve the" \ + " gfid split brain" +#define SERROR_GETTING_SRC_BRICK "Error getting the source brick" +int +afr_selfheal(xlator_t *this, uuid_t gfid); + +gf_boolean_t +afr_throttled_selfheal(call_frame_t *frame, xlator_t *this); + +int 
+afr_selfheal_name(xlator_t *this, uuid_t gfid, const char *name, void *gfid_req, + dict_t *xdata); + +int +afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd); + +int +afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode); + +int +afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode); + +int +afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name, + inode_t *inode, struct afr_reply *replies, int source, + unsigned char *sources, void *gfid, int *gfid_idx); + +int +afr_selfheal_inodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on); + +int +afr_selfheal_tryinodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on); + +int +afr_selfheal_tie_breaker_inodelk(call_frame_t *frame, xlator_t *this, + inode_t *inode, char *dom, off_t off, + size_t size, unsigned char *locked_on); + +int +afr_selfheal_uninodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + const unsigned char *locked_on); + +int +afr_selfheal_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on); + +int +afr_selfheal_tryentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on); + +int +afr_selfheal_tie_breaker_entrylk(call_frame_t *frame, xlator_t *this, + inode_t *inode, char *dom, const char *name, + unsigned char *locked_on); + +int +afr_selfheal_unentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on, + dict_t *xdata); + +int +afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid, + struct afr_reply *replies); + +int +afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies, + unsigned 
char *discover_on, dict_t *dict); +inode_t * +afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, + unsigned char *lookup_on, dict_t *xattr); + +int +afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + uint64_t *witness, unsigned char *flag); +int +afr_selfheal_fill_matrix(xlator_t *this, int **matrix, int subvol, int idx, + dict_t *xdata); + +int +afr_selfheal_extract_xattr(xlator_t *this, struct afr_reply *replies, + afr_transaction_type type, int *dirty, int **matrix); + +int +afr_sh_generic_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata); + +int +afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, unsigned char *healed_sinks, + struct afr_reply *replies); +int +afr_selfheal_undo_pending(call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *undid_pending, + afr_transaction_type type, struct afr_reply *replies, + unsigned char *locked_on); + +int +afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, + unsigned char *sources, inode_t *dir, + const char *name, inode_t *inode, + struct afr_reply *replies); + +int +afr_selfheal_post_op(call_frame_t *frame, xlator_t *this, inode_t *inode, + int subvol, dict_t *xattr, dict_t *xdata); + +call_frame_t * +afr_frame_create(xlator_t *this, int32_t *op_errno); + +inode_t * +afr_inode_find(xlator_t *this, uuid_t gfid); + +int +afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *parbuf); +void +afr_reply_copy(struct afr_reply *dst, struct afr_reply *src); + +void 
+afr_replies_copy(struct afr_reply *dst, struct afr_reply *src, int count); + +int +afr_selfheal_newentry_mark(call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, struct afr_reply *replies, + unsigned char *sources, unsigned char *newentry); + +unsigned int +afr_success_count(struct afr_reply *replies, unsigned int count); -#include <sys/stat.h> +void +afr_log_selfheal(uuid_t gfid, xlator_t *this, int ret, char *type, int source, + unsigned char *sources, unsigned char *healed_sinks); -#define FILETYPE_DIFFERS(buf1,buf2) ((buf1)->ia_type != (buf2)->ia_type) -#define PERMISSION_DIFFERS(buf1,buf2) (st_mode_from_ia ((buf1)->ia_prot, (buf1)->ia_type) != st_mode_from_ia ((buf2)->ia_prot, (buf2)->ia_type)) -#define OWNERSHIP_DIFFERS(buf1,buf2) (((buf1)->ia_uid != (buf2)->ia_uid) || ((buf1)->ia_gid != (buf2)->ia_gid)) -#define SIZE_DIFFERS(buf1,buf2) ((buf1)->ia_size != (buf2)->ia_size) +void +afr_mark_largest_file_as_source(xlator_t *this, unsigned char *sources, + struct afr_reply *replies); +void +afr_mark_active_sinks(xlator_t *this, unsigned char *sources, + unsigned char *locked_on, unsigned char *sinks); -#define SIZE_GREATER(buf1,buf2) ((buf1)->ia_size > (buf2)->ia_size) +gf_boolean_t +afr_dict_contains_heal_op(call_frame_t *frame); +gf_boolean_t +afr_can_decide_split_brain_source_sinks(struct afr_reply *replies, + int child_count); int -afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this); +afr_mark_split_brain_source_sinks( + call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies, afr_transaction_type type); + +int +afr_sh_get_fav_by_policy(xlator_t *this, struct afr_reply *replies, + inode_t *inode, char **policy_str); + int -afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this); +_afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this, + inode_t *inode, int 
source, + unsigned char *healed_sinks, + unsigned char *undid_pending, + afr_transaction_type type, + unsigned char *locked_on, + struct afr_reply *replies); + int -afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this); +afr_get_child_index_from_name(xlator_t *this, char *name); + +gf_boolean_t +afr_does_witness_exist(xlator_t *this, uint64_t *witness); int -afr_self_heal_entry (call_frame_t *frame, xlator_t *this); +__afr_selfheal_data_prepare(call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + unsigned char *undid_pending, + struct afr_reply *replies, unsigned char *flag); int -afr_self_heal_data (call_frame_t *frame, xlator_t *this); +__afr_selfheal_metadata_prepare(call_frame_t *frame, xlator_t *this, + inode_t *inode, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *undid_pending, + struct afr_reply *replies, unsigned char *flag); +int +__afr_selfheal_entry_prepare(call_frame_t *frame, xlator_t *this, + inode_t *inode, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, + struct afr_reply *replies, int *source_p, + unsigned char *flag); int -afr_self_heal_metadata (call_frame_t *frame, xlator_t *this); +afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, + inode_t **link_inode, gf_boolean_t *data_selfheal, + gf_boolean_t *metadata_selfheal, + gf_boolean_t *entry_selfheal, + struct afr_reply *replies); int -afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr); +afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid); + +int +afr_selfheal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata); + +int +afr_locked_fill(call_frame_t *frame, xlator_t *this, unsigned char *locked_on); +int 
+afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources, + afr_transaction_type type); int -afr_self_heal (call_frame_t *frame, xlator_t *this); +afr_selfheal_metadata_by_stbuf(xlator_t *this, struct iatt *stbuf); -#endif /* __AFR_SELF_HEAL_H__ */ +int +afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode); +int +afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode); +int +afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode); + +int +afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies, + inode_t *inode, uuid_t pargfid, const char *bname, + int src_idx, int child_idx, + unsigned char *locked_on, int *src, dict_t *xdata); +int +afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct afr_reply *replies, + afr_transaction_type type); + +gf_boolean_t +afr_is_file_empty_on_all_children(afr_private_t *priv, + struct afr_reply *replies); + +int +afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name, + inode_t *inode, int child, struct afr_reply *replies); +int +afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode); +#endif /* !_AFR_SELFHEAL_H */ diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c new file mode 100644 index 00000000000..109fd4b7421 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -0,0 +1,1716 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include "afr.h" +#include "afr-self-heal.h" +#include "afr-self-heald.h" +#include "protocol-common.h" +#include <glusterfs/syncop-utils.h> +#include "afr-messages.h" +#include <glusterfs/byte-order.h> + +#define AFR_EH_SPLIT_BRAIN_LIMIT 1024 +#define AFR_STATISTICS_HISTORY_SIZE 50 + +#define ASSERT_LOCAL(this, healer) \ + if (!afr_shd_is_subvol_local(this, healer->subvol)) { \ + healer->local = _gf_false; \ + if (safe_break(healer)) { \ + break; \ + } else { \ + continue; \ + } \ + } else { \ + healer->local = _gf_true; \ + } + +#define NTH_INDEX_HEALER(this, n) \ + &((((afr_private_t *)this->private))->shd.index_healers[n]) +#define NTH_FULL_HEALER(this, n) \ + &((((afr_private_t *)this->private))->shd.full_healers[n]) + +char * +afr_subvol_name(xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + + priv = this->private; + if (subvol < 0 || subvol > priv->child_count) + return NULL; + + return priv->children[subvol]->name; +} + +void +afr_destroy_crawl_event_data(void *data) +{ + return; +} + +void +afr_destroy_shd_event_data(void *data) +{ + shd_event_t *shd_event = data; + + if (!shd_event) + return; + GF_FREE(shd_event->path); + + return; +} + +gf_boolean_t +afr_shd_is_subvol_local(xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + gf_boolean_t is_local = _gf_false; + loc_t loc = { + 0, + }; + + loc.inode = this->itable->root; + gf_uuid_copy(loc.gfid, loc.inode->gfid); + priv = this->private; + syncop_is_subvol_local(priv->children[subvol], &loc, &is_local); + return is_local; +} + +int +__afr_shd_healer_wait(struct subvol_healer *healer) +{ + afr_private_t *priv = NULL; + struct timespec wait_till = { + 0, + }; + int ret = 0; + + priv = healer->this->private; + +disabled_loop: + wait_till.tv_sec = gf_time() + priv->shd.timeout; + + while (!healer->rerun) { + ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till); + if (ret == ETIMEDOUT) + break; + } + + ret = healer->rerun; + healer->rerun = 0; + + if 
(!priv->shd.enabled) + goto disabled_loop; + + return ret; +} + +int +afr_shd_healer_wait(struct subvol_healer *healer) +{ + int ret = 0; + + pthread_mutex_lock(&healer->mutex); + { + ret = __afr_shd_healer_wait(healer); + } + pthread_mutex_unlock(&healer->mutex); + + return ret; +} + +gf_boolean_t +safe_break(struct subvol_healer *healer) +{ + gf_boolean_t ret = _gf_false; + + pthread_mutex_lock(&healer->mutex); + { + if (healer->rerun) + goto unlock; + + healer->running = _gf_false; + ret = _gf_true; + } +unlock: + pthread_mutex_unlock(&healer->mutex); + + return ret; +} + +inode_t * +afr_shd_inode_find(xlator_t *this, xlator_t *subvol, uuid_t gfid) +{ + int ret = 0; + uint64_t val = IA_INVAL; + dict_t *xdata = NULL; + dict_t *rsp_dict = NULL; + inode_t *inode = NULL; + + xdata = dict_new(); + if (!xdata) + goto out; + + ret = dict_set_int8(xdata, GF_INDEX_IA_TYPE_GET_REQ, 1); + if (ret) + goto out; + + ret = syncop_inode_find(this, subvol, gfid, &inode, xdata, &rsp_dict); + if (ret < 0) + goto out; + + if (rsp_dict) { + ret = dict_get_uint64(rsp_dict, GF_INDEX_IA_TYPE_GET_RSP, &val); + if (ret) + goto out; + } + ret = inode_ctx_set2(inode, subvol, 0, &val); +out: + if (ret && inode) { + inode_unref(inode); + inode = NULL; + } + if (xdata) + dict_unref(xdata); + if (rsp_dict) + dict_unref(rsp_dict); + return inode; +} + +inode_t * +afr_shd_index_inode(xlator_t *this, xlator_t *subvol, char *vgfid) +{ + loc_t rootloc = { + 0, + }; + inode_t *inode = NULL; + int ret = 0; + dict_t *xattr = NULL; + void *index_gfid = NULL; + + rootloc.inode = inode_ref(this->itable->root); + gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid); + + ret = syncop_getxattr(subvol, &rootloc, &xattr, vgfid, NULL, NULL); + if (ret || !xattr) { + errno = -ret; + goto out; + } + + ret = dict_get_ptr(xattr, vgfid, &index_gfid); + if (ret) + goto out; + + gf_msg_debug(this->name, 0, "%s dir gfid for %s: %s", vgfid, subvol->name, + uuid_utoa(index_gfid)); + + inode = afr_shd_inode_find(this, subvol, 
index_gfid); + +out: + loc_wipe(&rootloc); + + if (xattr) + dict_unref(xattr); + + return inode; +} + +int +afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name, + ia_type_t type) +{ + int ret = 0; + loc_t loc = { + 0, + }; + + loc.parent = inode_ref(inode); + loc.name = name; + + if (IA_ISDIR(type)) + ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL); + else + ret = syncop_unlink(subvol, &loc, NULL, NULL); + + loc_wipe(&loc); + return ret; +} + +void +afr_shd_zero_xattrop(xlator_t *this, uuid_t gfid) +{ + call_frame_t *frame = NULL; + inode_t *inode = NULL; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int ret = 0; + int i = 0; + int raw[AFR_NUM_CHANGE_LOGS] = {0}; + + priv = this->private; + frame = afr_frame_create(this, NULL); + if (!frame) + goto out; + inode = afr_inode_find(this, gfid); + if (!inode) + goto out; + xattr = dict_new(); + if (!xattr) + goto out; + ret = dict_set_static_bin(xattr, AFR_DIRTY, raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) + goto out; + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_static_bin(xattr, priv->pending_key[i], raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) + goto out; + } + + /*Send xattrop to all bricks. 
Doing a lookup to see if bricks are up or + * has valid repies for this gfid seems a bit of an overkill.*/ + for (i = 0; i < priv->child_count; i++) + afr_selfheal_post_op(frame, this, inode, i, xattr, NULL); + +out: + if (frame) + AFR_STACK_DESTROY(frame); + if (inode) + inode_unref(inode); + if (xattr) + dict_unref(xattr); + return; +} + +int +afr_shd_selfheal_name(struct subvol_healer *healer, int child, uuid_t parent, + const char *bname) +{ + int ret = -1; + + ret = afr_selfheal_name(THIS, parent, bname, NULL, NULL); + + return ret; +} + +int +afr_shd_selfheal(struct subvol_healer *healer, int child, uuid_t gfid) +{ + int ret = 0; + eh_t *eh = NULL; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + shd_event_t *shd_event = NULL; + char *path = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + crawl_event_t *crawl_event = NULL; + + this = healer->this; + priv = this->private; + shd = &priv->shd; + crawl_event = &healer->crawl_event; + + subvol = priv->children[child]; + + // If this fails with ENOENT/ESTALE index is stale + ret = syncop_gfid_to_path(this->itable, subvol, gfid, &path); + if (ret < 0) + return ret; + + ret = afr_selfheal(this, gfid); + + LOCK(&priv->lock); + { + if (ret == -EIO) { + eh = shd->split_brain; + crawl_event->split_brain_count++; + } else if (ret < 0) { + crawl_event->heal_failed_count++; + } else if (ret == 0) { + crawl_event->healed_count++; + } + } + UNLOCK(&priv->lock); + + if (eh) { + shd_event = GF_CALLOC(1, sizeof(*shd_event), gf_afr_mt_shd_event_t); + if (!shd_event) + goto out; + + shd_event->child = child; + shd_event->path = path; + + if (eh_save_history(eh, shd_event) < 0) + goto out; + + shd_event = NULL; + path = NULL; + } +out: + GF_FREE(shd_event); + GF_FREE(path); + return ret; +} + +void +afr_shd_sweep_prepare(struct subvol_healer *healer) +{ + crawl_event_t *event = NULL; + + event = &healer->crawl_event; + + event->healed_count = 0; + event->split_brain_count = 0; + event->heal_failed_count = 
0; + + event->start_time = gf_time(); + event->end_time = 0; + _mask_cancellation(); +} + +void +afr_shd_sweep_done(struct subvol_healer *healer) +{ + crawl_event_t *event = NULL; + crawl_event_t *history = NULL; + afr_self_heald_t *shd = NULL; + + event = &healer->crawl_event; + shd = &(((afr_private_t *)healer->this->private)->shd); + + event->end_time = gf_time(); + history = gf_memdup(event, sizeof(*event)); + event->start_time = 0; + + if (!history) + return; + + if (eh_save_history(shd->statistics[healer->subvol], history) < 0) + GF_FREE(history); + _unmask_cancellation(); +} + +int +afr_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + void *data) +{ + struct subvol_healer *healer = data; + afr_private_t *priv = NULL; + uuid_t gfid = {0}; + int ret = 0; + uint64_t val = IA_INVAL; + + priv = healer->this->private; + if (!priv->shd.enabled) + return -EBUSY; + + gf_msg_debug(healer->this->name, 0, "got entry: %s from %s", entry->d_name, + priv->children[healer->subvol]->name); + + ret = gf_uuid_parse(entry->d_name, gfid); + if (ret) + return 0; + + inode_ctx_get2(parent->inode, subvol, NULL, &val); + + ret = afr_shd_selfheal(healer, healer->subvol, gfid); + + if (ret == -ENOENT || ret == -ESTALE) + afr_shd_entry_purge(subvol, parent->inode, entry->d_name, val); + + if (ret == 2) + /* If bricks crashed in pre-op after creating indices/xattrop + * link but before setting afr changelogs, we end up with stale + * xattrop links but zero changelogs. Remove such entries by + * sending a post-op with zero changelogs. 
+ */ + afr_shd_zero_xattrop(healer->this, gfid); + + return 0; +} + +int +afr_shd_index_sweep(struct subvol_healer *healer, char *vgfid) +{ + loc_t loc = {0}; + afr_private_t *priv = NULL; + int ret = 0; + xlator_t *subvol = NULL; + dict_t *xdata = NULL; + call_frame_t *frame = NULL; + + priv = healer->this->private; + subvol = priv->children[healer->subvol]; + + frame = afr_frame_create(healer->this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + + loc.inode = afr_shd_index_inode(healer->this, subvol, vgfid); + if (!loc.inode) { + gf_msg(healer->this->name, GF_LOG_WARNING, 0, + AFR_MSG_INDEX_DIR_GET_FAILED, "unable to get index-dir on %s", + subvol->name); + ret = -errno; + goto out; + } + + xdata = dict_new(); + if (!xdata || dict_set_int32_sizen(xdata, "get-gfid-type", 1)) { + ret = -ENOMEM; + goto out; + } + + ret = syncop_mt_dir_scan(frame, subvol, &loc, GF_CLIENT_PID_SELF_HEALD, + healer, afr_shd_index_heal, xdata, + priv->shd.max_threads, priv->shd.wait_qlength); + + if (ret == 0) + ret = healer->crawl_event.healed_count; + +out: + loc_wipe(&loc); + + if (xdata) + dict_unref(xdata); + if (frame) + AFR_STACK_DESTROY(frame); + return ret; +} + +int +afr_shd_index_sweep_all(struct subvol_healer *healer) +{ + int ret = 0; + int count = 0; + + ret = afr_shd_index_sweep(healer, GF_XATTROP_INDEX_GFID); + if (ret < 0) + goto out; + count = ret; + + ret = afr_shd_index_sweep(healer, GF_XATTROP_DIRTY_GFID); + if (ret < 0) + goto out; + count += ret; + + ret = afr_shd_index_sweep(healer, GF_XATTROP_ENTRY_CHANGES_GFID); + if (ret < 0) + goto out; + count += ret; +out: + if (ret < 0) + return ret; + else + return count; +} + +int +afr_shd_full_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + void *data) +{ + struct subvol_healer *healer = data; + xlator_t *this = healer->this; + afr_private_t *priv = NULL; + + priv = this->private; + + if (this->cleanup_starting) { + return -ENOTCONN; + } + + if (!priv->shd.enabled) + return -EBUSY; + + 
afr_shd_selfheal_name(healer, healer->subvol, parent->inode->gfid, + entry->d_name); + + afr_shd_selfheal(healer, healer->subvol, entry->d_stat.ia_gfid); + + return 0; +} + +/* + * Walk the tree rooted at inode on this healer's subvolume with syncop_ftw, + * calling afr_shd_full_heal for each entry, and return syncop_ftw's result. + * loc only borrows inode: no reference is taken here and loc is not wiped. + */ +int +afr_shd_full_sweep(struct subvol_healer *healer, inode_t *inode) +{ + afr_private_t *priv = NULL; + loc_t loc = {0}; + + priv = healer->this->private; + loc.inode = inode; + return syncop_ftw(priv->children[healer->subvol], &loc, + GF_CLIENT_PID_SELF_HEALD, healer, afr_shd_full_heal); +} + +/* + * Populate loc so that it addresses the thin-arbiter file: the parent is the + * root inode, the name is pending_key[THIN_ARBITER_BRICK_INDEX] and the inode + * is newly allocated. The gfid is taken from the cached priv->ta_gfid, or, + * when that is still null, from a lookup on the thin-arbiter brick whose + * result is then cached. Returns 0 on success; on failure loc is wiped and a + * non-zero value is returned. + */ +int +afr_shd_fill_ta_loc(xlator_t *this, loc_t *loc) +{ + afr_private_t *priv = NULL; + struct iatt stbuf = { + 0, + }; + int ret = -1; + + priv = this->private; + loc->parent = inode_ref(this->itable->root); + gf_uuid_copy(loc->pargfid, loc->parent->gfid); + loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX]; + loc->inode = inode_new(loc->parent->table); + /* presumably sets ret and jumps to out when the allocation failed - TODO confirm macro */ + GF_CHECK_ALLOC(loc->inode, ret, out); + + if (!gf_uuid_is_null(priv->ta_gfid)) + goto assign_gfid; + + ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], loc, &stbuf, + 0, 0, 0); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed lookup on file %s.", loc->name); + goto out; + } + + /* remember the gfid so later calls can skip the lookup */ + gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid); + +assign_gfid: + gf_uuid_copy(loc->gfid, priv->ta_gfid); + ret = 0; + +out: + if (ret) + loc_wipe(loc); + + return ret; +} + +int +_afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata) +{ + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int raw[AFR_NUM_CHANGE_LOGS] = { + 0, + }; + int ret = -1; + int i = 0; + + priv = this->private; + + xattr = dict_new(); + if (!xattr) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_GET_FAILED, + "Failed to create dict."); + goto out; + } + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_static_bin(xattr, priv->pending_key[i], &raw, + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret) + goto out; + } + + ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL, 
xdata, NULL); + if (ret || !(*xdata)) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Xattrop failed on %s.", loc->name); + } + +out: + if (xattr) + dict_unref(xattr); + + return ret; +} + +void +afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, struct subvol_healer *healer, + dict_t **xdata) +{ + int ret = 0; + + loc_wipe(loc); + if (afr_shd_fill_ta_loc(this, loc)) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate thin-arbiter loc for: %s.", loc->name); + ret = -1; + goto out; + } + + ret = afr_ta_post_op_lock(this, loc); + if (ret) + goto out; + + ret = _afr_shd_ta_get_xattrs(this, loc, xdata); + if (ret) { + if (*xdata) { + dict_unref(*xdata); + *xdata = NULL; + } + } + + afr_ta_post_op_unlock(this, loc); + +out: + if (ret) + healer->rerun = 1; +} + +int +afr_shd_ta_unset_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer) +{ + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + gf_boolean_t need_xattrop = _gf_false; + void *pending_raw = NULL; + int *raw = NULL; + int pending[AFR_NUM_CHANGE_LOGS] = { + 0, + }; + int i = 0; + int j = 0; + int val = 0; + int ret = -1; + + priv = this->private; + + xattr = dict_new(); + if (!xattr) { + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t); + if (!raw) { + goto out; + } + + ret = dict_get_ptr(*xdata, priv->pending_key[i], &pending_raw); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "Error getting value " + "of pending key %s", + priv->pending_key[i]); + GF_FREE(raw); + goto out; + } + + memcpy(pending, pending_raw, sizeof(pending)); + for (j = 0; j < AFR_NUM_CHANGE_LOGS; j++) { + val = ntoh32(pending[j]); + if (val) { + if (i == healer) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_THIN_ARB, + "I am " + "not the good shd. Skipping. 
" + "SHD = %d.", + healer); + ret = 0; + GF_FREE(raw); + goto out; + } + need_xattrop = _gf_true; + raw[j] = hton32(-val); + } + } + + ret = dict_set_bin(xattr, priv->pending_key[i], raw, + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret) { + GF_FREE(raw); + goto out; + } + + if (need_xattrop) + break; + } + + if (!need_xattrop) { + ret = 0; + goto out; + } + + ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Xattrop failed."); + +out: + if (xattr) + dict_unref(xattr); + + return ret; +} + +void +afr_shd_ta_check_and_unset_xattrs(xlator_t *this, loc_t *loc, + struct subvol_healer *healer, + dict_t *pre_crawl_xdata) +{ + int ret_lock = 0; + int ret = 0; + dict_t *post_crawl_xdata = NULL; + + ret_lock = afr_ta_post_op_lock(this, loc); + if (ret_lock) + goto unref; + + ret = _afr_shd_ta_get_xattrs(this, loc, &post_crawl_xdata); + if (ret) + goto unref; + + if (!are_dicts_equal(pre_crawl_xdata, post_crawl_xdata, NULL, NULL)) { + ret = -1; + goto unref; + } + + ret = afr_shd_ta_unset_xattrs(this, loc, &post_crawl_xdata, healer->subvol); + +unref: + if (post_crawl_xdata) { + dict_unref(post_crawl_xdata); + post_crawl_xdata = NULL; + } + + if (ret || ret_lock) + healer->rerun = 1; + + if (!ret_lock) + afr_ta_post_op_unlock(this, loc); +} + +gf_boolean_t +afr_bricks_available_for_heal(afr_private_t *priv) +{ + int up_children = 0; + + up_children = __afr_get_up_children_count(priv); + if (up_children < 2) { + return _gf_false; + } + return _gf_true; +} + +static gf_boolean_t +afr_shd_ta_needs_heal(xlator_t *this, struct subvol_healer *healer) +{ + dict_t *xdata = NULL; + afr_private_t *priv = NULL; + loc_t loc = { + 0, + }; + int ret = -1; + int i = 0; + gf_boolean_t need_heal = _gf_false; + + priv = this->private; + + ret = afr_shd_fill_ta_loc(this, &loc); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + 
"Failed to populate thin-arbiter loc for: %s.", loc.name); + healer->rerun = 1; + goto out; + } + + if (_afr_shd_ta_get_xattrs(this, &loc, &xdata)) { + healer->rerun = 1; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (afr_ta_dict_contains_pending_xattr(xdata, priv, i)) { + need_heal = _gf_true; + break; + } + } + +out: + if (xdata) + dict_unref(xdata); + loc_wipe(&loc); + + return need_heal; +} + +static int +afr_shd_anon_inode_cleaner(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + void *data) +{ + struct subvol_healer *healer = data; + afr_private_t *priv = healer->this->private; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int ret = 0; + loc_t loc = {0}; + int count = 0; + int i = 0; + int op_errno = 0; + struct iatt *iatt = NULL; + gf_boolean_t multiple_links = _gf_false; + unsigned char *gfid_present = alloca0(priv->child_count); + unsigned char *entry_present = alloca0(priv->child_count); + char *type = "file"; + + frame = afr_frame_create(healer->this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + local = frame->local; + if (AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) { + gf_msg_debug(healer->this->name, 0, + "Not all bricks are up. 
Skipping " + "cleanup of %s on %s", + entry->d_name, subvol->name); + ret = 0; + goto out; + } + + loc.inode = inode_new(parent->inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + ret = gf_uuid_parse(entry->d_name, loc.gfid); + if (ret) { + ret = 0; + goto out; + } + AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc, + NULL); + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == 0) { + count++; + gfid_present[i] = 1; + iatt = &local->replies[i].poststat; + if (iatt->ia_type == IA_IFDIR) { + type = "dir"; + } + + if (i == healer->subvol) { + if (local->replies[i].poststat.ia_nlink > 1) { + multiple_links = _gf_true; + } + } + } else if (local->replies[i].op_errno != ENOENT && + local->replies[i].op_errno != ESTALE) { + /*We don't have complete view. Skip the entry*/ + gf_msg_debug(healer->this->name, local->replies[i].op_errno, + "Skipping cleanup of %s on %s", entry->d_name, + subvol->name); + ret = 0; + goto out; + } + } + + /*Inode is deleted from subvol*/ + if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links)) { + gf_msg(healer->this->name, GF_LOG_WARNING, 0, + AFR_MSG_EXPUNGING_FILE_OR_DIR, "expunging %s %s/%s on %s", type, + priv->anon_inode_name, entry->d_name, subvol->name); + ret = afr_shd_entry_purge(subvol, parent->inode, entry->d_name, + iatt->ia_type); + if (ret == -ENOENT || ret == -ESTALE) + ret = 0; + } else if (count > 1) { + loc_wipe(&loc); + loc.parent = inode_ref(parent->inode); + loc.name = entry->d_name; + loc.inode = inode_new(parent->inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, + &loc, NULL); + count = 0; + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == 0) { + count++; + entry_present[i] = 1; + iatt = &local->replies[i].poststat; + } else if (local->replies[i].op_errno != ENOENT && + local->replies[i].op_errno != ESTALE) { + /*We don't have 
complete view. Skip the entry*/ + gf_msg_debug(healer->this->name, local->replies[i].op_errno, + "Skipping cleanup of %s on %s", entry->d_name, + subvol->name); + ret = 0; + goto out; + } + } + for (i = 0; i < priv->child_count; i++) { + if (gfid_present[i] && !entry_present[i]) { + /*Entry is not anonymous on at least one subvol*/ + gf_msg_debug(healer->this->name, 0, + "Valid entry present on %s " + "Skipping cleanup of %s on %s", + priv->children[i]->name, entry->d_name, + subvol->name); + ret = 0; + goto out; + } + } + + gf_msg(healer->this->name, GF_LOG_WARNING, 0, + AFR_MSG_EXPUNGING_FILE_OR_DIR, + "expunging %s %s/%s on all subvols", type, priv->anon_inode_name, + entry->d_name); + ret = 0; + for (i = 0; i < priv->child_count; i++) { + op_errno = -afr_shd_entry_purge(priv->children[i], loc.parent, + entry->d_name, iatt->ia_type); + if (op_errno != ENOENT && op_errno != ESTALE) { + ret |= -op_errno; + } + } + } + +out: + if (frame) + AFR_STACK_DESTROY(frame); + loc_wipe(&loc); + return ret; +} + +static void +afr_cleanup_anon_inode_dir(struct subvol_healer *healer) +{ + int ret = 0; + call_frame_t *frame = NULL; + afr_private_t *priv = healer->this->private; + loc_t loc = {0}; + + ret = afr_anon_inode_create(healer->this, healer->subvol, &loc.inode); + if (ret) + goto out; + + frame = afr_frame_create(healer->this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + + ret = syncop_mt_dir_scan(frame, priv->children[healer->subvol], &loc, + GF_CLIENT_PID_SELF_HEALD, healer, + afr_shd_anon_inode_cleaner, NULL, + priv->shd.max_threads, priv->shd.wait_qlength); +out: + if (frame) + AFR_STACK_DESTROY(frame); + loc_wipe(&loc); + return; +} + +void * +afr_shd_index_healer(void *data) +{ + struct subvol_healer *healer = NULL; + xlator_t *this = NULL; + int ret = 0; + afr_private_t *priv = NULL; + dict_t *pre_crawl_xdata = NULL; + loc_t loc = { + 0, + }; + + healer = data; + THIS = this = healer->this; + priv = this->private; + + for (;;) { + 
afr_shd_healer_wait(healer); + + if (!afr_bricks_available_for_heal(priv)) + continue; + + ASSERT_LOCAL(this, healer); + priv->local[healer->subvol] = healer->local; + + if (priv->thin_arbiter_count) { + if (afr_shd_ta_needs_heal(this, healer)) + afr_shd_ta_get_xattrs(this, &loc, healer, &pre_crawl_xdata); + } + + do { + gf_msg_debug(this->name, 0, "starting index sweep on subvol %s", + afr_subvol_name(this, healer->subvol)); + + afr_shd_sweep_prepare(healer); + + ret = afr_shd_index_sweep_all(healer); + + afr_shd_sweep_done(healer); + /* + As long as at least one gfid was + healed, keep retrying. We may have + just healed a directory and thereby + created entries for other gfids which + could not be healed thus far. + */ + + gf_msg_debug(this->name, 0, "finished index sweep on subvol %s", + afr_subvol_name(this, healer->subvol)); + /* + Give a pause before retrying to avoid a busy loop + in case the only entry in index is because of + an ongoing I/O. + */ + sleep(1); + } while (ret > 0); + + if (ret == 0) { + afr_cleanup_anon_inode_dir(healer); + } + + if (ret == 0 && pre_crawl_xdata && + !healer->crawl_event.heal_failed_count) { + afr_shd_ta_check_and_unset_xattrs(this, &loc, healer, + pre_crawl_xdata); + } + + if (pre_crawl_xdata) { + dict_unref(pre_crawl_xdata); + pre_crawl_xdata = NULL; + } + } + + return NULL; +} + +void * +afr_shd_full_healer(void *data) +{ + struct subvol_healer *healer = NULL; + xlator_t *this = NULL; + int run = 0; + + healer = data; + THIS = this = healer->this; + + for (;;) { + pthread_mutex_lock(&healer->mutex); + { + run = __afr_shd_healer_wait(healer); + if (!run) + healer->running = _gf_false; + } + pthread_mutex_unlock(&healer->mutex); + + if (!run) + break; + + ASSERT_LOCAL(this, healer); + + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, + "starting full sweep on subvol %s", + afr_subvol_name(this, healer->subvol)); + + afr_shd_sweep_prepare(healer); + + afr_shd_full_sweep(healer, this->itable->root); + + 
afr_shd_sweep_done(healer); + + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, + "finished full sweep on subvol %s", + afr_subvol_name(this, healer->subvol)); + } + + return NULL; +} + +/* + * Initialise a healer's mutex and condition variable and reset its state + * flags. Returns 0 on success or the non-zero pthread error code. + * NOTE(review): if pthread_cond_init fails the mutex initialised just before + * is not destroyed here - confirm the caller's teardown covers that. + */ +int +afr_shd_healer_init(xlator_t *this, struct subvol_healer *healer) +{ + int ret = 0; + + ret = pthread_mutex_init(&healer->mutex, NULL); + if (ret) + goto out; + + ret = pthread_cond_init(&healer->cond, NULL); + if (ret) + goto out; + + healer->this = this; + healer->running = _gf_false; + healer->rerun = _gf_false; + healer->local = _gf_false; +out: + return ret; +} + +/* + * Under the healer's mutex: signal the healer's condition variable if its + * thread is already running, otherwise create the thread running threadfn. + * In both cases rerun is then set. If thread creation fails neither running + * nor rerun is touched and the creation error is returned; otherwise 0. + */ +int +afr_shd_healer_spawn(xlator_t *this, struct subvol_healer *healer, + void *(threadfn)(void *)) +{ + int ret = 0; + + pthread_mutex_lock(&healer->mutex); + { + if (healer->running) { + pthread_cond_signal(&healer->cond); + } else { + ret = gf_thread_create(&healer->thread, NULL, threadfn, healer, + "shdheal"); + if (ret) + goto unlock; + healer->running = 1; + } + + healer->rerun = 1; + } +unlock: + pthread_mutex_unlock(&healer->mutex); + + return ret; +} + +/* Start (or wake) the full-crawl healer selected by NTH_FULL_HEALER(this, subvol). */ +int +afr_shd_full_healer_spawn(xlator_t *this, int subvol) +{ + return afr_shd_healer_spawn(this, NTH_FULL_HEALER(this, subvol), + afr_shd_full_healer); +} + +/* Start (or wake) the index healer selected by NTH_INDEX_HEALER(this, subvol). */ +int +afr_shd_index_healer_spawn(xlator_t *this, int subvol) +{ + return afr_shd_healer_spawn(this, NTH_INDEX_HEALER(this, subvol), + afr_shd_index_healer); +} + +int +afr_shd_dict_add_crawl_event(xlator_t *this, dict_t *output, + crawl_event_t *crawl_event) +{ + int ret = 0; + uint64_t count = 0; + char key[128] = {0}; + int keylen = 0; + char suffix[64] = {0}; + int xl_id = 0; + uint64_t healed_count = 0; + uint64_t split_brain_count = 0; + uint64_t heal_failed_count = 0; + char *start_time_str = 0; + char *end_time_str = NULL; + char *crawl_type = NULL; + int progress = -1; + int child = -1; + + child = crawl_event->child; + healed_count = crawl_event->healed_count; + split_brain_count = crawl_event->split_brain_count; + heal_failed_count = crawl_event->heal_failed_count; + crawl_type 
= crawl_event->crawl_type; + + if (!crawl_event->start_time) + goto out; + + start_time_str = gf_strdup(ctime(&crawl_event->start_time)); + + if (crawl_event->end_time) + end_time_str = gf_strdup(ctime(&crawl_event->end_time)); + + ret = dict_get_int32(output, this->name, &xl_id); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "xl does not have id"); + goto out; + } + + snprintf(key, sizeof(key), "statistics-%d-%d-count", xl_id, child); + ret = dict_get_uint64(output, key, &count); + + snprintf(suffix, sizeof(suffix), "%d-%d-%" PRIu64, xl_id, child, count); + snprintf(key, sizeof(key), "statistics_healed_cnt-%s", suffix); + ret = dict_set_uint64(output, key, healed_count); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not add statistics_healed_count to output"); + goto out; + } + + snprintf(key, sizeof(key), "statistics_sb_cnt-%s", suffix); + ret = dict_set_uint64(output, key, split_brain_count); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not add statistics_split_brain_count to output"); + goto out; + } + + keylen = snprintf(key, sizeof(key), "statistics_crawl_type-%s", suffix); + ret = dict_set_strn(output, key, keylen, crawl_type); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not add statistics_crawl_type to output"); + goto out; + } + + snprintf(key, sizeof(key), "statistics_heal_failed_cnt-%s", suffix); + ret = dict_set_uint64(output, key, heal_failed_count); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not add statistics_healed_failed_count to output"); + goto out; + } + + keylen = snprintf(key, sizeof(key), "statistics_strt_time-%s", suffix); + ret = dict_set_dynstrn(output, key, keylen, start_time_str); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not add statistics_crawl_start_time to output"); + goto out; + } 
else { + start_time_str = NULL; + } + + if (!end_time_str) + progress = 1; + else + progress = 0; + + keylen = snprintf(key, sizeof(key), "statistics_end_time-%s", suffix); + if (!end_time_str) + end_time_str = gf_strdup("Could not determine the end time"); + ret = dict_set_dynstrn(output, key, keylen, end_time_str); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not add statistics_crawl_end_time to output"); + goto out; + } else { + end_time_str = NULL; + } + + keylen = snprintf(key, sizeof(key), "statistics_inprogress-%s", suffix); + + ret = dict_set_int32n(output, key, keylen, progress); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not add statistics_inprogress to output"); + goto out; + } + + snprintf(key, sizeof(key), "statistics-%d-%d-count", xl_id, child); + ret = dict_set_uint64(output, key, count + 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not increment the counter."); + goto out; + } +out: + GF_FREE(start_time_str); + GF_FREE(end_time_str); + return ret; +} + +int +afr_shd_dict_add_path(xlator_t *this, dict_t *output, int child, char *path, + struct timeval *tv) +{ + int ret = -1; + uint64_t count = 0; + char key[64] = {0}; + int keylen = 0; + char xl_id_child_str[32] = {0}; + int xl_id = 0; + + ret = dict_get_int32(output, this->name, &xl_id); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "xl does not have id"); + goto out; + } + + snprintf(xl_id_child_str, sizeof(xl_id_child_str), "%d-%d", xl_id, child); + snprintf(key, sizeof(key), "%s-count", xl_id_child_str); + ret = dict_get_uint64(output, key, &count); + + keylen = snprintf(key, sizeof(key), "%s-%" PRIu64, xl_id_child_str, count); + ret = dict_set_dynstrn(output, key, keylen, path); + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "%s: Could not add to output", path); + goto out; + } + + if (tv) { + 
snprintf(key, sizeof(key), "%s-%" PRIu64 "-time", xl_id_child_str, + count); + ret = dict_set_uint32(output, key, tv->tv_sec); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "%s: Could not set time", path); + goto out; + } + } + + snprintf(key, sizeof(key), "%s-count", xl_id_child_str); + + ret = dict_set_uint64(output, key, count + 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Could not increment count"); + goto out; + } + + ret = 0; +out: + return ret; +} + +/* + * History-dump callback for split-brain events: for events whose child has a + * local index healer, add a copy of the event path (and the buffer's + * timestamp) to the output dict passed as data. + */ +int +afr_add_shd_event(circular_buffer_t *cb, void *data) +{ + dict_t *output = NULL; + xlator_t *this = THIS; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + shd_event_t *shd_event = NULL; + char *path = NULL; + + output = data; + priv = this->private; + shd = &priv->shd; + shd_event = cb->data; + + if (!shd->index_healers[shd_event->child].local) + return 0; + + path = gf_strdup(shd_event->path); + if (!path) + return -ENOMEM; + + afr_shd_dict_add_path(this, output, shd_event->child, path, &cb->tv); + return 0; +} + +/* + * History-dump callback for crawl statistics: for events whose child has a + * local index healer, add the crawl event to the output dict passed as data. + */ +int +afr_add_crawl_event(circular_buffer_t *cb, void *data) +{ + dict_t *output = NULL; + xlator_t *this = THIS; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + crawl_event_t *crawl_event = NULL; + + output = data; + priv = this->private; + shd = &priv->shd; + crawl_event = cb->data; + + if (!shd->index_healers[crawl_event->child].local) + return 0; + + afr_shd_dict_add_crawl_event(this, output, crawl_event); + + return 0; +} + +/* + * Allocate and initialise the self-heal daemon state in priv->shd: one index + * healer and one full healer per child, the split-brain event history and + * one crawl-statistics history per child. Returns 0 on success and a + * non-zero value if any allocation or healer initialisation fails. + */ +int +afr_selfheal_daemon_init(xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int ret = -1; + int i = 0; + + priv = this->private; + shd = &priv->shd; + + shd->index_healers = GF_CALLOC(sizeof(*shd->index_healers), + priv->child_count, + gf_afr_mt_subvol_healer_t); + if (!shd->index_healers) + goto out; + + for (i = 0; i < priv->child_count; i++) { + shd->index_healers[i].subvol = i; + ret = afr_shd_healer_init(this, 
&shd->index_healers[i]); + if (ret) + goto out; + } + + /* + * The loop above leaves ret == 0. Re-arm the failure code so that an + * allocation failure below is not reported as success. + */ + ret = -1; + + shd->full_healers = GF_CALLOC(sizeof(*shd->full_healers), priv->child_count, + gf_afr_mt_subvol_healer_t); + if (!shd->full_healers) + goto out; + for (i = 0; i < priv->child_count; i++) { + shd->full_healers[i].subvol = i; + ret = afr_shd_healer_init(this, &shd->full_healers[i]); + if (ret) + goto out; + } + + /* same as above: the remaining failure paths must not return 0 */ + ret = -1; + + shd->split_brain = eh_new(AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, + afr_destroy_shd_event_data); + if (!shd->split_brain) + goto out; + + shd->statistics = GF_CALLOC(sizeof(eh_t *), priv->child_count, + gf_common_mt_eh_t); + if (!shd->statistics) + goto out; + + for (i = 0; i < priv->child_count; i++) { + shd->statistics[i] = eh_new(AFR_STATISTICS_HISTORY_SIZE, _gf_false, + afr_destroy_crawl_event_data); + if (!shd->statistics[i]) + goto out; + shd->full_healers[i].crawl_event.child = i; + shd->full_healers[i].crawl_event.crawl_type = "FULL"; + shd->index_healers[i].crawl_event.child = i; + shd->index_healers[i].crawl_event.crawl_type = "INDEX"; + } + + ret = 0; +out: + return ret; +} + +/* When running as the self-heal daemon, start or wake the index healer of every child that is up. */ +void +afr_selfheal_childup(xlator_t *this, afr_private_t *priv) +{ + int subvol = 0; + + if (!priv->shd.iamshd) + return; + for (subvol = 0; subvol < priv->child_count; subvol++) + if (priv->child_up[subvol]) + afr_shd_index_healer_spawn(this, subvol); + + return; +} + +/* + * Read the GF_XATTROP_INDEX_COUNT xattr of the root inode from child i and + * store it in *count. Returns 0 on success, a non-zero value otherwise. + */ +int +afr_shd_get_index_count(xlator_t *this, int i, uint64_t *count) +{ + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + loc_t rootloc = { + 0, + }; + dict_t *xattr = NULL; + int ret = -1; + + priv = this->private; + subvol = priv->children[i]; + + rootloc.inode = inode_ref(this->itable->root); + gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid); + + ret = syncop_getxattr(subvol, &rootloc, &xattr, GF_XATTROP_INDEX_COUNT, + NULL, NULL); + if (ret < 0) + goto out; + + ret = dict_get_uint64(xattr, GF_XATTROP_INDEX_COUNT, count); + if (ret) + goto out; + + ret = 0; + +out: + if (xattr) + dict_unref(xattr); + loc_wipe(&rootloc); + + return ret; +} + +int 
+afr_xl_op(xlator_t *this, dict_t *input, dict_t *output) +{ + gf_xl_afr_op_t op = GF_SHD_OP_INVALID; + int ret = 0; + int xl_id = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + struct subvol_healer *healer = NULL; + int i = 0; + char key[64]; + int keylen = 0; + int this_name_len = 0; + int op_ret = 0; + uint64_t cnt = 0; + +#define AFR_SET_DICT_AND_LOG(name, output, key, keylen, dict_str, \ + dict_str_len) \ + { \ + int ret; \ + \ + ret = dict_set_nstrn(output, key, keylen, dict_str, dict_str_len); \ + if (ret) { \ + gf_smsg(name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, \ + "key=%s", key, "value=%s", dict_str, NULL); \ + } \ + } + + priv = this->private; + shd = &priv->shd; + + ret = dict_get_int32_sizen(input, "xl-op", (int32_t *)&op); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "key=xl-op", NULL); + goto out; + } + this_name_len = strlen(this->name); + ret = dict_get_int32n(input, this->name, this_name_len, &xl_id); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "key=%s", this->name, NULL); + goto out; + } + ret = dict_set_int32n(output, this->name, this_name_len, xl_id); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "key=%s", this->name, NULL); + goto out; + } + switch (op) { + case GF_SHD_OP_HEAL_INDEX: + op_ret = 0; + + for (i = 0; i < priv->child_count; i++) { + healer = &shd->index_healers[i]; + keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i); + + if (!priv->child_up[i]) { + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SBRICK_NOT_CONNECTED, + SLEN(SBRICK_NOT_CONNECTED)); + op_ret = -1; + } else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) { + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SLESS_THAN2_BRICKS_in_REP, + SLEN(SLESS_THAN2_BRICKS_in_REP)); + op_ret = -1; + } else if (!afr_shd_is_subvol_local(this, healer->subvol)) { + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, 
+ SBRICK_IS_REMOTE, + SLEN(SBRICK_IS_REMOTE)); + } else { + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SSTARTED_SELF_HEAL, + SLEN(SSTARTED_SELF_HEAL)); + + ret = afr_shd_index_healer_spawn(this, i); + + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_HEALER_SPAWN_FAILED, NULL); + } + } + } + break; + case GF_SHD_OP_HEAL_FULL: + op_ret = -1; + + for (i = 0; i < priv->child_count; i++) { + healer = &shd->full_healers[i]; + keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i); + + if (!priv->child_up[i]) { + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SBRICK_NOT_CONNECTED, + SLEN(SBRICK_NOT_CONNECTED)); + } else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) { + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SLESS_THAN2_BRICKS_in_REP, + SLEN(SLESS_THAN2_BRICKS_in_REP)); + } else if (!afr_shd_is_subvol_local(this, healer->subvol)) { + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SBRICK_IS_REMOTE, + SLEN(SBRICK_IS_REMOTE)); + } else { + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SSTARTED_SELF_HEAL, + SLEN(SSTARTED_SELF_HEAL)); + + ret = afr_shd_full_healer_spawn(this, i); + + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_HEALER_SPAWN_FAILED, NULL); + } + op_ret = 0; + } + } + break; + case GF_SHD_OP_INDEX_SUMMARY: + /* this case has been handled in glfs-heal.c */ + break; + case GF_SHD_OP_SPLIT_BRAIN_FILES: + eh_dump(shd->split_brain, output, afr_add_shd_event); + break; + case GF_SHD_OP_STATISTICS: + for (i = 0; i < priv->child_count; i++) { + eh_dump(shd->statistics[i], output, afr_add_crawl_event); + ret = afr_shd_dict_add_crawl_event( + this, output, &shd->index_healers[i].crawl_event); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL); + } + + ret = afr_shd_dict_add_crawl_event( + this, output, &shd->full_healers[i].crawl_event); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_ADD_CRAWL_EVENT_FAILED, 
NULL); + } + } + break; + case GF_SHD_OP_STATISTICS_HEAL_COUNT: + case GF_SHD_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: + op_ret = -1; + + for (i = 0; i < priv->child_count; i++) { + if (!priv->child_up[i]) { + keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, + i); + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SBRICK_NOT_CONNECTED, + SLEN(SBRICK_NOT_CONNECTED)); + } else { + snprintf(key, sizeof(key), "%d-%d-hardlinks", xl_id, i); + ret = afr_shd_get_index_count(this, i, &cnt); + if (ret == 0) { + ret = dict_set_uint64(output, key, cnt); + } + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_DICT_SET_FAILED, NULL); + } + op_ret = 0; + } + } + + break; + + default: + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "op=%d", + op, NULL); + break; + } +out: + dict_deln(output, this->name, this_name_len); + return op_ret; + +#undef AFR_SET_DICT_AND_LOG +} diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h new file mode 100644 index 00000000000..18db728ea7b --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -0,0 +1,75 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef _AFR_SELF_HEALD_H +#define _AFR_SELF_HEALD_H + +#include <pthread.h> + +typedef struct { + char *path; + int child; +} shd_event_t; + +typedef struct { + uint64_t healed_count; + uint64_t split_brain_count; + uint64_t heal_failed_count; + + /* If start_time is 0, it means crawler is not in progress + and stats are not valid */ + time_t start_time; + /* If start_time is NOT 0 and end_time is 0, it means + crawler is in progress */ + time_t end_time; + char *crawl_type; + int child; +} crawl_event_t; + +struct subvol_healer { + xlator_t *this; + crawl_event_t crawl_event; + pthread_mutex_t mutex; + pthread_cond_t cond; + pthread_t thread; + int subvol; + gf_boolean_t local; + gf_boolean_t running; + gf_boolean_t rerun; +}; + +typedef struct { + struct subvol_healer *index_healers; + struct subvol_healer *full_healers; + + eh_t *split_brain; + eh_t **statistics; + int timeout; + uint32_t max_threads; + uint32_t wait_qlength; + uint32_t halo_max_latency_msec; + gf_boolean_t iamshd; + gf_boolean_t enabled; +} afr_self_heald_t; + +int +afr_selfheal_daemon_init(xlator_t *this); + +int +afr_xl_op(xlator_t *this, dict_t *input, dict_t *output); + +int +afr_shd_gfid_to_path(xlator_t *this, xlator_t *subvol, uuid_t gfid, + char **path_p); + +int +afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name, + ia_type_t type); +#endif /* !_AFR_SELF_HEALD_H */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 4afb7ce6f2a..a51f79b1f43 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -1,1242 +1,2927 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
- GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#include "dict.h" -#include "byte-order.h" -#include "common-utils.h" +#include <glusterfs/dict.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/timer.h> #include "afr.h" #include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-messages.h" #include <signal.h> +typedef enum { + AFR_TRANSACTION_PRE_OP, + AFR_TRANSACTION_POST_OP, +} afr_xattrop_type_t; -#define LOCKED_NO 0x0 /* no lock held */ -#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path - of RENAME */ -#define LOCKED_LOWER 0x2 /* for lower_path of RENAME */ +static void +afr_lock_resume_shared(struct list_head *list); +static void +afr_post_op_handle_success(call_frame_t *frame, xlator_t *this); -afr_fd_ctx_t * -afr_fd_ctx_get (fd_t *fd, xlator_t *this) -{ - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = 0; +static void +afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno); - ret = fd_ctx_get (fd, this, &ctx); +void +__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared); - if 
(ret < 0) - goto out; +void +afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this); - fd_ctx = (afr_fd_ctx_t *)(long) ctx; +int +afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this); -out: - return fd_ctx; -} +gf_boolean_t +afr_changelog_pre_op_uninherit(call_frame_t *frame, xlator_t *this); +gf_boolean_t +afr_changelog_pre_op_update(call_frame_t *frame, xlator_t *this); + +int +afr_changelog_call_count(afr_transaction_type type, + unsigned char *pre_op_subvols, + unsigned char *failed_subvols, + unsigned int child_count); +int +afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume, + afr_xattrop_type_t op); static void -afr_pid_save (call_frame_t *frame) -{ - afr_local_t * local = NULL; +afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this); - local = frame->local; +static int +afr_ta_post_op_do(void *opaque); - local->saved_pid = frame->root->pid; -} +static int +afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local); +static int +afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this); static void -afr_pid_restore (call_frame_t *frame) +afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno); + +void +afr_ta_locked_priv_invalidate(afr_private_t *priv) +{ + priv->ta_bad_child_index = AFR_CHILD_UNKNOWN; + priv->release_ta_notify_dom_lock = _gf_false; + priv->ta_notify_dom_lock_offset = 0; +} + +static void +afr_ta_process_waitq(xlator_t *this) { - afr_local_t * local = NULL; + afr_local_t *entry = NULL; + afr_private_t *priv = this->private; + struct list_head waitq = { + 0, + }; + + INIT_LIST_HEAD(&waitq); + LOCK(&priv->lock); + list_splice_init(&priv->ta_waitq, &waitq); + UNLOCK(&priv->lock); + list_for_each_entry(entry, &waitq, ta_waitq) + { + afr_ta_decide_post_op_state(entry->transaction.frame, this); + } +} - local = frame->local; +int +afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque) +{ + 
afr_ta_process_waitq(ta_frame->this); + STACK_DESTROY(ta_frame->root); + return 0; +} - frame->root->pid = local->saved_pid; +int +afr_release_notify_lock_for_ta(void *opaque) +{ + xlator_t *this = NULL; + afr_private_t *priv = NULL; + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; + int ret = -1; + + this = (xlator_t *)opaque; + priv = this->private; + ret = afr_fill_ta_loc(this, &loc, _gf_true); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate loc for thin-arbiter."); + goto out; + } + flock.l_type = F_UNLCK; + flock.l_start = priv->ta_notify_dom_lock_offset; + flock.l_len = 1; + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_NOTIFY, &loc, F_SETLK, &flock, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to unlock AFR_TA_DOM_NOTIFY lock."); + } + + LOCK(&priv->lock); + { + afr_ta_locked_priv_invalidate(priv); + } + UNLOCK(&priv->lock); +out: + loc_wipe(&loc); + return ret; } +void +afr_zero_fill_stat(afr_local_t *local) +{ + if (!local) + return; + if (local->transaction.type == AFR_DATA_TRANSACTION || + local->transaction.type == AFR_METADATA_TRANSACTION) { + gf_zero_fill_stat(&local->cont.inode_wfop.prebuf); + gf_zero_fill_stat(&local->cont.inode_wfop.postbuf); + } else if (local->transaction.type == AFR_ENTRY_TRANSACTION || + local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + gf_zero_fill_stat(&local->cont.dir_fop.buf); + gf_zero_fill_stat(&local->cont.dir_fop.preparent); + gf_zero_fill_stat(&local->cont.dir_fop.postparent); + if (local->transaction.type == AFR_ENTRY_TRANSACTION) + return; + gf_zero_fill_stat(&local->cont.dir_fop.prenewparent); + gf_zero_fill_stat(&local->cont.dir_fop.postnewparent); + } +} -static void -__mark_all_pending (int32_t *pending[], int child_count, - afr_transaction_type type) +/* In case of errors afr needs to choose which xdata from lower xlators it needs + * to unwind with. 
The way it is done is by checking if there are + * any good subvols which failed. Give preference to errnos other than + * ENOTCONN even if the child is source */ +void +afr_pick_error_xdata(afr_local_t *local, afr_private_t *priv, inode_t *inode1, + unsigned char *readable1, inode_t *inode2, + unsigned char *readable2) { - int i; - int j; + int s = -1; /*selection*/ + int i = 0; + unsigned char *readable = NULL; + + if (local->xdata_rsp) { + dict_unref(local->xdata_rsp); + local->xdata_rsp = NULL; + } + + readable = alloca0(priv->child_count * sizeof(*readable)); + if (inode2 && readable2) { /*rename fop*/ + AFR_INTERSECT(readable, readable1, readable2, priv->child_count); + } else { + memcpy(readable, readable1, sizeof(*readable) * priv->child_count); + } + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + + if (local->replies[i].op_ret >= 0) + continue; + + if (local->replies[i].op_errno == ENOTCONN) + continue; + + /*Order is important in the following condition*/ + if ((s < 0) || (!readable[s] && readable[i])) + s = i; + } + + if (s != -1 && local->replies[s].xdata) { + local->xdata_rsp = dict_ref(local->replies[s].xdata); + } else if (s == -1) { + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + + if (local->replies[i].op_ret >= 0) + continue; - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - pending[i][j] = hton32 (1); + if (!local->replies[i].xdata) + continue; + local->xdata_rsp = dict_ref(local->replies[i].xdata); + break; } + } } +gf_boolean_t +afr_needs_changelog_update(afr_local_t *local) +{ + if (local->transaction.type == AFR_DATA_TRANSACTION) + return _gf_true; + if (!local->optimistic_change_log) + return _gf_true; + return _gf_false; +} -static void -__mark_child_dead (int32_t *pending[], int child_count, int child, - afr_transaction_type type) +gf_boolean_t +afr_changelog_has_quorum(afr_local_t *local, xlator_t *this) { - int j; + 
afr_private_t *priv = NULL; + int i = 0; + unsigned char *success_children = NULL; - j = afr_index_for_transaction_type (type); + priv = this->private; + success_children = alloca0(priv->child_count); - pending[child][j] = 0; + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.failed_subvols[i]) { + success_children[i] = 1; + } + } + + if (afr_has_quorum(success_children, this, NULL)) { + return _gf_true; + } + + return _gf_false; } +gf_boolean_t +afr_is_write_subvol_valid(call_frame_t *frame, xlator_t *this) +{ + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + uint64_t write_subvol = 0; + unsigned char *writable = NULL; + uint16_t datamap = 0; + + local = frame->local; + priv = this->private; + writable = alloca0(priv->child_count); + + write_subvol = afr_write_subvol_get(frame, this); + datamap = (write_subvol & 0x00000000ffff0000) >> 16; + for (i = 0; i < priv->child_count; i++) { + if (datamap & (1 << i)) + writable[i] = 1; + + if (writable[i] && !local->transaction.failed_subvols[i]) + return _gf_true; + } + + return _gf_false; +} -static void -__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index) +int +afr_transaction_fop(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + unsigned char *failed_subvols = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + failed_subvols = local->transaction.failed_subvols; + call_count = priv->child_count - + AFR_COUNT(failed_subvols, priv->child_count); + /* Fail if pre-op did not succeed on quorum no. of bricks. */ + if (!afr_changelog_has_quorum(local, this) || !call_count) { + local->op_ret = -1; + /* local->op_errno is already captured in changelog cbk. 
*/ + afr_transaction_resume(frame, this); + return 0; + } + + /* Fail if at least one writeable brick isn't up.*/ + if (local->transaction.type == AFR_DATA_TRANSACTION && + !afr_is_write_subvol_valid(frame, this)) { + local->op_ret = -1; + local->op_errno = EIO; + afr_transaction_resume(frame, this); + return 0; + } - local = frame->local; + local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && !failed_subvols[i]) { + local->transaction.wind(frame, this, i); - if (!local->fd) - return; + if (!--call_count) + break; + } + } - fd_ctx = afr_fd_ctx_get (local->fd, this); + return 0; +} - if (!fd_ctx) - goto out; +int +afr_transaction_done(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_boolean_t unwind = _gf_false; + afr_lock_t *lock = NULL; + afr_local_t *lock_local = NULL; + + priv = this->private; + local = frame->local; - LOCK (&local->fd->lock); + if (priv->consistent_metadata) { + LOCK(&frame->lock); { - if (local->transaction.type == AFR_DATA_TRANSACTION) - fd_ctx->pre_op_done[child_index]++; + unwind = (local->transaction.main_frame != NULL); } - UNLOCK (&local->fd->lock); -out: - return; + UNLOCK(&frame->lock); + if (unwind) /*It definitely did post-op*/ + afr_zero_fill_stat(local); + } + + if (local->transaction.do_eager_unlock) { + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK(&local->inode->lock); + { + lock->acquired = _gf_false; + lock->release = _gf_false; + list_splice_init(&lock->frozen, &lock->waiting); + if (list_empty(&lock->waiting)) + goto unlock; + lock_local = list_entry(lock->waiting.next, afr_local_t, + transaction.wait_list); + list_del_init(&lock_local->transaction.wait_list); + list_add(&lock_local->transaction.owner_list, &lock->owners); + } + unlock: + UNLOCK(&local->inode->lock); + } + if (lock_local) { + afr_lock(lock_local->transaction.frame, + lock_local->transaction.frame->this); + } + 
local->transaction.unwind(frame, this); + + GF_ASSERT(list_empty(&local->transaction.owner_list)); + GF_ASSERT(list_empty(&local->transaction.wait_list)); + AFR_STACK_DESTROY(frame); + + return 0; } +static void +afr_lock_fail_shared(afr_local_t *local, struct list_head *list) +{ + afr_local_t *each = NULL; + + while (!list_empty(list)) { + each = list_entry(list->next, afr_local_t, transaction.wait_list); + list_del_init(&each->transaction.wait_list); + each->op_ret = -1; + each->op_errno = local->op_errno; + afr_transaction_done(each->transaction.frame, + each->transaction.frame->this); + } +} static void -__mark_pre_op_undone_on_fd (call_frame_t *frame, xlator_t *this, int child_index) +afr_handle_lock_acquire_failure(afr_local_t *local) { - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; + struct list_head shared; + afr_lock_t *lock = NULL; - local = frame->local; + if (!local->transaction.eager_lock_on) + goto out; - if (!local->fd) - return; + lock = &local->inode_ctx->lock[local->transaction.type]; - fd_ctx = afr_fd_ctx_get (local->fd, this); + INIT_LIST_HEAD(&shared); + LOCK(&local->inode->lock); + { + lock->release = _gf_true; + list_splice_init(&lock->waiting, &shared); + } + UNLOCK(&local->inode->lock); - if (!fd_ctx) - goto out; - - LOCK (&local->fd->lock); - { - if (local->transaction.type == AFR_DATA_TRANSACTION) - fd_ctx->pre_op_done[child_index]--; - } - UNLOCK (&local->fd->lock); + afr_lock_fail_shared(local, &shared); + local->transaction.do_eager_unlock = _gf_true; out: - return; + local->internal_lock.lock_cbk = afr_transaction_done; + afr_unlock(local->transaction.frame, local->transaction.frame->this); } - -static void -__mark_down_children (int32_t *pending[], int child_count, - unsigned char *child_up, afr_transaction_type type) +call_frame_t * +afr_transaction_detach_fop_frame(call_frame_t *frame) { - int i; - int j; + afr_local_t *local = NULL; + call_frame_t *fop_frame = NULL; - for (i = 0; i < child_count; i++) { - j = 
afr_index_for_transaction_type (type); + local = frame->local; - if (!child_up[i]) - pending[i][j] = 0; - } -} + afr_handle_inconsistent_fop(frame, &local->op_ret, &local->op_errno); + LOCK(&frame->lock); + { + fop_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK(&frame->lock); + return fop_frame; +} static void -__mark_all_success (int32_t *pending[], int child_count, - afr_transaction_type type) +afr_save_lk_owner(call_frame_t *frame) { - int i; - int j; + afr_local_t *local = NULL; - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - pending[i][j] = hton32 (-1); - } -} + local = frame->local; + local->saved_lk_owner = frame->root->lk_owner; +} -static int -__changelog_enabled (afr_private_t *priv, afr_transaction_type type) +static void +afr_restore_lk_owner(call_frame_t *frame) { - int ret = 0; - - switch (type) { - case AFR_DATA_TRANSACTION: - if (priv->data_change_log) - ret = 1; + afr_local_t *local = NULL; - break; + local = frame->local; - case AFR_METADATA_TRANSACTION: - if (priv->metadata_change_log) - ret = 1; + frame->root->lk_owner = local->saved_lk_owner; +} - break; +void +__mark_all_success(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i; - case AFR_ENTRY_TRANSACTION: - case AFR_ENTRY_RENAME_TRANSACTION: - if (priv->entry_change_log) - ret = 1; + local = frame->local; + priv = this->private; - break; - } + for (i = 0; i < priv->child_count; i++) { + local->transaction.failed_subvols[i] = 0; + } +} - return ret; +void +afr_compute_pre_op_sources(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_transaction_type type = -1; + dict_t *xdata = NULL; + int **matrix = NULL; + int idx = -1; + int i = 0; + int j = 0; + + priv = this->private; + local = frame->local; + type = local->transaction.type; + idx = afr_index_for_transaction_type(type); + matrix = 
ALLOC_MATRIX(priv->child_count, int); + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.changelog_xdata[i]) + continue; + xdata = local->transaction.changelog_xdata[i]; + afr_selfheal_fill_matrix(this, matrix, i, idx, xdata); + } + + memset(local->transaction.pre_op_sources, 1, priv->child_count); + + /*If lock or pre-op failed on a brick, it is not a source. */ + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + local->transaction.pre_op_sources[i] = 0; + } + + /* If brick is blamed by others, it is not a source. */ + for (i = 0; i < priv->child_count; i++) + for (j = 0; j < priv->child_count; j++) + if (matrix[i][j] != 0) + local->transaction.pre_op_sources[j] = 0; } +void +afr_txn_arbitrate_fop(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int pre_op_sources_count = 0; + int i = 0; + + priv = this->private; + local = frame->local; + + afr_compute_pre_op_sources(frame, this); + pre_op_sources_count = AFR_COUNT(local->transaction.pre_op_sources, + priv->child_count); + + /* If arbiter is the only source, do not proceed. 
*/ + if (pre_op_sources_count < 2 && + local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) { + local->op_ret = -1; + local->op_errno = ENOTCONN; + for (i = 0; i < priv->child_count; i++) + local->transaction.failed_subvols[i] = 1; + } + + afr_transaction_fop(frame, this); + + return; +} -static int -__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this) +int +afr_transaction_perform_fop(call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int ret = 0; + int failure_count = 0; + struct list_head shared; + afr_lock_t *lock = NULL; + + local = frame->local; + priv = this->private; + + INIT_LIST_HEAD(&shared); + if (local->transaction.type == AFR_DATA_TRANSACTION && + !local->transaction.inherited) { + ret = afr_write_subvol_set(frame, this); + if (ret) { + /*act as if operation failed on all subvols*/ + local->op_ret = -1; + local->op_errno = -ret; + for (i = 0; i < priv->child_count; i++) + local->transaction.failed_subvols[i] = 1; + } + } + + if (local->pre_op_compat) + /* old mode, pre-op was done as afr_changelog_do() + just now, before OP */ + afr_changelog_pre_op_update(frame, this); + + if (!local->transaction.eager_lock_on || local->transaction.inherited) + goto fop; + failure_count = AFR_COUNT(local->transaction.failed_subvols, + priv->child_count); + if (failure_count == priv->child_count) { + afr_handle_lock_acquire_failure(local); + return 0; + } else { + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK(&local->inode->lock); + { + lock->acquired = _gf_true; + __afr_transaction_wake_shared(local, &shared); + } + UNLOCK(&local->inode->lock); + } + +fop: + /* Perform fops with the lk-owner from top xlator. + * Eg: lk-owner of posix-lk and flush should be same, + * flush cant clear the posix-lks without that lk-owner. 
+ */ + afr_save_lk_owner(frame); + frame->root->lk_owner = local->transaction.main_frame->root->lk_owner; + + if (priv->arbiter_count == 1) { + afr_txn_arbitrate_fop(frame, this); + } else { + afr_transaction_fop(frame, this); + } + + afr_lock_resume_shared(&shared); + return 0; +} - int op_ret = 0; +int +afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int **pending) +{ + int i = 0; + int ret = 0; - priv = this->private; - local = frame->local; + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_static_bin(xattr, priv->pending_key[i], pending[i], + AFR_NUM_CHANGE_LOGS * sizeof(int)); + /* 3 = data+metadata+entry */ - if (__changelog_enabled (priv, local->transaction.type)) { - switch (local->op) { + if (ret) + break; + } - case GF_FOP_WRITE: - case GF_FOP_FTRUNCATE: - op_ret = 1; - break; + return ret; +} - case GF_FOP_FLUSH: - op_ret = 0; - break; +static void +afr_ta_dom_lock_check_and_release(afr_ta_fop_state_t fop_state, xlator_t *this) +{ + afr_private_t *priv = this->private; + unsigned int inmem_count = 0; + unsigned int onwire_count = 0; + gf_boolean_t release = _gf_false; + + LOCK(&priv->lock); + { + /*Once we get notify lock release upcall notification, + if any of the fop state counters are non-zero, we will + not release the lock. 
+ */ + onwire_count = priv->ta_on_wire_txn_count; + inmem_count = priv->ta_in_mem_txn_count; + switch (fop_state) { + case TA_GET_INFO_FROM_TA_FILE: + onwire_count = --priv->ta_on_wire_txn_count; + break; + case TA_INFO_IN_MEMORY_SUCCESS: + case TA_INFO_IN_MEMORY_FAILED: + inmem_count = --priv->ta_in_mem_txn_count; + break; + case TA_WAIT_FOR_NOTIFY_LOCK_REL: + GF_ASSERT(0); + break; + case TA_SUCCESS: + break; + } + release = priv->release_ta_notify_dom_lock; + } + UNLOCK(&priv->lock); - default: - op_ret = 1; - } - } + if (inmem_count != 0 || release == _gf_false || onwire_count != 0) + return; - return op_ret; + afr_ta_lock_release_synctask(this); } +static void +afr_ta_process_onwireq(afr_ta_fop_state_t fop_state, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *entry = NULL; + int bad_child = AFR_CHILD_UNKNOWN; + + struct list_head onwireq = { + 0, + }; + INIT_LIST_HEAD(&onwireq); + + LOCK(&priv->lock); + { + bad_child = priv->ta_bad_child_index; + if (bad_child == AFR_CHILD_UNKNOWN) { + /*The previous on-wire ta_post_op was a failure. Just dequeue + *one element to wind on-wire again. */ + entry = list_entry(priv->ta_onwireq.next, afr_local_t, ta_onwireq); + list_del_init(&entry->ta_onwireq); + } else { + /* Prepare to process all fops based on bad_child_index. 
*/ + list_splice_init(&priv->ta_onwireq, &onwireq); + } + } + UNLOCK(&priv->lock); + + if (entry) { + afr_ta_post_op_synctask(this, entry); + return; + } else { + while (!list_empty(&onwireq)) { + entry = list_entry(onwireq.next, afr_local_t, ta_onwireq); + list_del_init(&entry->ta_onwireq); + if (entry->ta_failed_subvol == bad_child) { + afr_post_op_handle_success(entry->transaction.frame, this); + } else { + afr_post_op_handle_failure(entry->transaction.frame, this, EIO); + } + } + } +} -static int -__changelog_needed_post_op (call_frame_t *frame, xlator_t *this) +int +afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + int_lock = &local->internal_lock; + + if (priv->thin_arbiter_count) { + /*fop should not come here with TA_WAIT_FOR_NOTIFY_LOCK_REL state */ + afr_ta_dom_lock_check_and_release(local->fop_state, this); + } + + /* Fail the FOP if post-op did not succeed on quorum no. of bricks. */ + if (!afr_changelog_has_quorum(local, this)) { + local->op_ret = -1; + /*local->op_errno is already captured in changelog cbk*/ + } + + if (local->transaction.resume_stub) { + call_resume(local->transaction.resume_stub); + local->transaction.resume_stub = NULL; + } + + int_lock->lock_cbk = afr_transaction_done; + afr_unlock(frame, this); + + return 0; +} - int op_ret = 0; - afr_transaction_type type = -1; +static void +afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno) +{ + afr_local_t *local = frame->local; + local->op_ret = -1; + local->op_errno = op_errno; - priv = this->private; - local = frame->local; - type = local->transaction.type; + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_THIN_ARB, + "Failing %s for gfid %s. 
Fop state is:%d", gf_fop_list[local->op], + uuid_utoa(local->inode->gfid), local->fop_state); - if (__changelog_enabled (priv, type)) { - switch (local->op) { + afr_changelog_post_op_done(frame, this); +} - case GF_FOP_WRITE: - case GF_FOP_FTRUNCATE: - op_ret = 1; - break; +unsigned char * +afr_locked_nodes_get(afr_transaction_type type, afr_internal_lock_t *int_lock) +{ + /*Because same set of subvols participate in all lockee + * entities*/ + return int_lock->lockee[0].locked_nodes; +} - case GF_FOP_FLUSH: - op_ret = 0; - break; +int +afr_changelog_call_count(afr_transaction_type type, + unsigned char *pre_op_subvols, + unsigned char *failed_subvols, + unsigned int child_count) +{ + int i = 0; + int call_count = 0; - default: - op_ret = 1; - } + for (i = 0; i < child_count; i++) { + if (pre_op_subvols[i] && !failed_subvols[i]) { + call_count++; } + } - return op_ret; -} + if (type == AFR_ENTRY_RENAME_TRANSACTION) + call_count *= 2; + return call_count; +} -static int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending) +gf_boolean_t +afr_txn_nothing_failed(call_frame_t *frame, xlator_t *this) { - int i; - int ret = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + if (priv->thin_arbiter_count) { + /* We need to perform post-op even if 1 data brick was down + * before the txn started.*/ + if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count)) + return _gf_false; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && + local->transaction.failed_subvols[i]) + return _gf_false; + } + + return _gf_true; +} - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_static_bin (xattr, priv->pending_key[i], - pending[i], 3 * sizeof (int32_t)); - /* 3 = data+metadata+entry */ +void +afr_handle_symmetric_errors(call_frame_t *frame, xlator_t *this) +{ + if (afr_is_symmetric_error(frame, this)) + __mark_all_success(frame, 
this); +} - if (ret < 0) - goto out; +gf_boolean_t +afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame) +{ + unsigned int quorum_count = 0; + afr_private_t *priv = NULL; + unsigned int up_children_count = 0; + + priv = this->private; + up_children_count = AFR_COUNT(subvols, priv->child_count); + + if (afr_lookup_has_quorum(frame, up_children_count)) + return _gf_true; + + if (priv->quorum_count == AFR_QUORUM_AUTO) { + /* + * Special case for auto-quorum with an even number of nodes. + * + * A replica set with even count N can only handle the same + * number of failures as odd N-1 before losing "vanilla" + * quorum, and the probability of more simultaneous failures is + * actually higher. For example, with a 1% chance of failure + * we'd have a 0.03% chance of two simultaneous failures with + * N=3 but a 0.06% chance with N=4. However, the special case + * is necessary for N=2 because there's no real quorum in that + * case (i.e. can't normally survive *any* failures). In that + * case, we treat the first node as a tie-breaker, allowing + * quorum to be retained in some cases while still honoring the + * all-important constraint that there can not simultaneously + * be two partitioned sets of nodes each believing they have + * quorum. Of two equally sized sets, the one without that + * first node will lose. + * + * It turns out that the special case is beneficial for higher + * values of N as well. Continuing the example above, the + * probability of losing quorum with N=4 and this type of + * quorum is (very) slightly lower than with N=3 and vanilla + * quorum. The difference becomes even more pronounced with + * higher N. Therefore, even though such replica counts are + * unlikely to be seen in practice, we might as well use the + * "special" quorum then as well. 
+ */ + if ((up_children_count * 2) == priv->child_count) { + return subvols[0]; } + } -out: - return ret; + if (priv->quorum_count == AFR_QUORUM_AUTO) { + quorum_count = priv->child_count / 2 + 1; + } else { + quorum_count = priv->quorum_count; + } + + if (up_children_count >= quorum_count) + return _gf_true; + + return _gf_false; } +static gf_boolean_t +afr_has_fop_quorum(call_frame_t *frame) +{ + xlator_t *this = frame->this; + afr_local_t *local = frame->local; + unsigned char *locked_nodes = NULL; -static int -afr_set_piggyback_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, - afr_transaction_type type) + locked_nodes = afr_locked_nodes_get(local->transaction.type, + &local->internal_lock); + return afr_has_quorum(locked_nodes, this, NULL); +} + +static gf_boolean_t +afr_has_fop_cbk_quorum(call_frame_t *frame) { - int i; - int ret = 0; - int *arr = NULL; - int index = 0; - size_t pending_xattr_size = 3 * sizeof (int32_t); - /* 3 = data+metadata+entry */ + afr_local_t *local = frame->local; + xlator_t *this = frame->this; + afr_private_t *priv = this->private; + unsigned char *success = alloca0(priv->child_count); + int i = 0; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) + if (!local->transaction.failed_subvols[i]) + success[i] = 1; + } + + return afr_has_quorum(success, this, NULL); +} - index = afr_index_for_transaction_type (type); +gf_boolean_t +afr_need_dirty_marking(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *local = NULL; + gf_boolean_t need_dirty = _gf_false; - for (i = 0; i < priv->child_count; i++) { - arr = GF_CALLOC (1, pending_xattr_size, - gf_afr_mt_char); - if (!arr) { - ret = -1; - goto out; - } + local = frame->local; - memcpy (arr, pending[i], pending_xattr_size); + if (!priv->quorum_count || !local->optimistic_change_log) + return _gf_false; - arr[index]++; + if (local->transaction.type == AFR_DATA_TRANSACTION || + local->transaction.type == 
AFR_METADATA_TRANSACTION) + return _gf_false; - ret = dict_set_bin (xattr, priv->pending_key[i], - arr, pending_xattr_size); + if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count) == + priv->child_count) + return _gf_false; - if (ret < 0) - goto out; - } + if (!afr_has_fop_cbk_quorum(frame)) + need_dirty = _gf_true; -out: - return ret; + return need_dirty; } - -int -afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) +void +afr_handle_quorum(call_frame_t *frame, xlator_t *this) { - int ret = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + const char *file = NULL; + uuid_t gfid = {0}; + + local = frame->local; + priv = frame->this->private; - switch (type) { - case AFR_DATA_TRANSACTION: - ret = priv->child_count; - break; + if (priv->quorum_count == 0) + return; - case AFR_METADATA_TRANSACTION: - ret = priv->child_count; - break; + /* If the fop already failed return right away to preserve errno */ + if (local->op_ret == -1) + return; - case AFR_ENTRY_TRANSACTION: - case AFR_ENTRY_RENAME_TRANSACTION: - ret = priv->child_count; - break; - } + /* + * Network split may happen just after the fops are unwound, so check + * if the fop succeeded in a way it still follows quorum. If it doesn't, + * mark the fop as failure, mark the changelogs so it reflects that + * failure. + * + * Scenario: + * There are 3 mounts on 3 machines(node1, node2, node3) all writing to + * single file. Network split happened in a way that node1 can't see + * node2, node3. Node2, node3 both of them can't see node1. Now at the + * time of sending write all the bricks are up. Just after write fop is + * wound on node1, network split happens. Node1 thinks write fop failed + * on node2, node3 so marks pending changelog for those 2 extended + * attributes on node1. Node2, node3 thinks writes failed on node1 so + * they mark pending changelog for node1. When the network is stable + * again the file already is in split-brain. 
These checks prevent + * marking pending changelog on other subvolumes if the fop doesn't + * succeed in a way it is still following quorum. So with this fix what + * is happening is, node1 will have all pending changelog(FOOL) because + * the write succeeded only on node1 but failed on node2, node3 so + * instead of marking pending changelogs on node2, node3 it just treats + * the fop as failure and goes into DIRTY state. Where as node2, node3 + * say they are sources and have pending changelog to node1 so there is + * no split-brain with the fix. The problem is eliminated completely. + */ + + if (afr_has_fop_cbk_quorum(frame)) + return; - return ret; + if (afr_need_dirty_marking(frame, this)) + goto set_response; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) + afr_transaction_fop_failed(frame, frame->this, i); + } + +set_response: + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + if (local->op_errno == 0) + local->op_errno = afr_quorum_errno(priv); + + if (local->fd) { + gf_uuid_copy(gfid, local->fd->inode->gfid); + file = uuid_utoa(gfid); + } else { + loc_path(&local->loc, local->loc.name); + file = local->loc.path; + } + + gf_msg(frame->this->name, GF_LOG_WARNING, local->op_errno, + AFR_MSG_QUORUM_FAIL, "%s: Failing %s as quorum is not met", file, + gf_fop_list[local->op]); + + switch (local->transaction.type) { + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + afr_pick_error_xdata(local, priv, local->parent, local->readable, + local->parent2, local->readable2); + break; + default: + afr_pick_error_xdata(local, priv, local->inode, local->readable, + NULL, NULL); + break; + } } -/* {{{ pending */ +int +afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop) +{ + afr_private_t *priv = NULL; + + priv = this->private; + loc->parent = inode_ref(priv->root_inode); + gf_uuid_copy(loc->pargfid, loc->parent->gfid); + loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX]; 
+ if (is_gfid_based_fop && gf_uuid_is_null(priv->ta_gfid)) { + /* Except afr_ta_id_file_check() which is path based, all other gluster + * FOPS need gfid.*/ + return -EINVAL; + } + gf_uuid_copy(loc->gfid, priv->ta_gfid); + loc->inode = inode_new(loc->parent->table); + if (!loc->inode) { + loc_wipe(loc); + return -ENOMEM; + } + return 0; +} -int32_t -afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) +static int +afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque) { - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int child_index = 0; + xlator_t *this = NULL; + afr_local_t *local = NULL; + call_frame_t *txn_frame = NULL; + afr_ta_fop_state_t fop_state; + + local = (afr_local_t *)opaque; + fop_state = local->fop_state; + txn_frame = local->transaction.frame; + this = frame->this; + + if (ret == 0) { + /*Mark pending xattrs on the up data brick.*/ + afr_post_op_handle_success(txn_frame, this); + } else { + afr_post_op_handle_failure(txn_frame, this, -ret); + } + + STACK_DESTROY(frame->root); + afr_ta_process_onwireq(fop_state, this); + + return 0; +} - int call_count = -1; +int ** +afr_set_changelog_xattr(afr_private_t *priv, unsigned char *pending, + dict_t *xattr, afr_local_t *local) +{ + int **changelog = NULL; + int idx = 0; + int ret = 0; + int i; + + if (local->is_new_entry == _gf_true) { + changelog = afr_mark_pending_changelog(priv, pending, xattr, + local->cont.dir_fop.buf.ia_type); + } else { + idx = afr_index_for_transaction_type(local->transaction.type); + changelog = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!changelog) { + goto out; + } + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + changelog[i][idx] = hton32(1); + } + ret = afr_set_pending_dict(priv, xattr, changelog); + if (ret < 0) { + afr_matrix_cleanup(changelog, priv->child_count); + return 
NULL; + } + } - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; +out: + return changelog; +} - child_index = (long) cookie; +static void +afr_ta_locked_xattrop_validate(afr_private_t *priv, afr_local_t *local, + gf_boolean_t *valid) +{ + if (priv->ta_event_gen > local->ta_event_gen) { + /* We can't trust the ta's response anymore.*/ + afr_ta_locked_priv_invalidate(priv); + *valid = _gf_false; + return; + } + return; +} - if (op_ret == 1) { +static int +afr_ta_post_op_do(void *opaque) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t *this = NULL; + dict_t *xattr = NULL; + unsigned char *pending = NULL; + int **changelog = NULL; + int failed_subvol = -1; + int success_subvol = -1; + loc_t loc = { + 0, + }; + int i = 0; + int ret = 0; + gf_boolean_t valid = _gf_true; + + local = (afr_local_t *)opaque; + this = local->transaction.frame->this; + priv = this->private; + + ret = afr_fill_ta_loc(this, &loc, _gf_true); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate loc for thin-arbiter."); + goto out; + } + + xattr = dict_new(); + if (!xattr) { + ret = -ENOMEM; + goto out; + } + + pending = alloca0(priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) { + pending[i] = 1; + failed_subvol = i; + } else { + success_subvol = i; } - - if (op_ret == 0) { - __mark_pre_op_undone_on_fd (frame, this, child_index); + } + + changelog = afr_set_changelog_xattr(priv, pending, xattr, local); + + if (!changelog) { + ret = -ENOMEM; + goto out; + } + + ret = afr_ta_post_op_lock(this, &loc); + if (ret) + goto out; + + ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Post-op on thin-arbiter id file %s failed for gfid %s.", + priv->pending_key[THIN_ARBITER_BRICK_INDEX], + 
uuid_utoa(local->inode->gfid)); + } + LOCK(&priv->lock); + { + if (ret == 0) { + priv->ta_bad_child_index = failed_subvol; + } else if (ret == -EINVAL) { + priv->ta_bad_child_index = success_subvol; + ret = -EIO; /* TA failed the fop. Return EIO to application. */ } - LOCK (&frame->lock); - { - call_count = --local->call_count; - } - UNLOCK (&frame->lock); + afr_ta_locked_xattrop_validate(priv, local, &valid); + } + UNLOCK(&priv->lock); + if (valid == _gf_false) { + gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB, + "Post-op on thin-arbiter id file %s for gfid %s invalidated due " + "to event-gen mismatch.", + priv->pending_key[THIN_ARBITER_BRICK_INDEX], + uuid_utoa(local->inode->gfid)); + ret = -EIO; + } + + afr_ta_post_op_unlock(this, &loc); +out: + if (xattr) + dict_unref(xattr); - if (call_count == 0) { - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); - } else { - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - } - } + if (changelog) + afr_matrix_cleanup(changelog, priv->child_count); - return 0; -} + loc_wipe(&loc); + return ret; +} -void -afr_update_read_child (call_frame_t *frame, xlator_t *this, inode_t *inode, - afr_transaction_type type) +static int +afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local) { - int curr_read_child = -1; - int new_read_child = -1; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int **pending = NULL; - int idx = 0; + call_frame_t *ta_frame = NULL; + int ret = 0; + + ta_frame = afr_ta_frame_create(this); + if (!ta_frame) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to create ta_frame"); + goto err; + } + ret = synctask_new(this->ctx->env, afr_ta_post_op_do, afr_ta_post_op_done, + ta_frame, local); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to launch post-op on thin arbiter for gfid %s", + uuid_utoa(local->inode->gfid)); + 
STACK_DESTROY(ta_frame->root); + goto err; + } + + return ret; +err: + afr_changelog_post_op_fail(local->transaction.frame, this, ENOMEM); + return ret; +} - idx = afr_index_for_transaction_type (type); +static void +afr_ta_set_fop_state(afr_private_t *priv, afr_local_t *local, + int *on_wire_count) +{ + LOCK(&priv->lock); + { + if (priv->release_ta_notify_dom_lock == _gf_true) { + /* Put the fop in waitq until notify dom lock is released.*/ + local->fop_state = TA_WAIT_FOR_NOTIFY_LOCK_REL; + list_add_tail(&local->ta_waitq, &priv->ta_waitq); + } else if (priv->ta_bad_child_index == AFR_CHILD_UNKNOWN) { + /* Post-op on thin-arbiter to decide success/failure. */ + local->fop_state = TA_GET_INFO_FROM_TA_FILE; + *on_wire_count = ++priv->ta_on_wire_txn_count; + if (*on_wire_count > 1) { + /*Avoid sending multiple on-wire post-ops on TA*/ + list_add_tail(&local->ta_onwireq, &priv->ta_onwireq); + } + } else if (local->ta_failed_subvol == priv->ta_bad_child_index) { + /* Post-op on TA not needed as the fop failed on the in-memory bad + * brick. Just mark pending xattrs on the good data brick.*/ + local->fop_state = TA_INFO_IN_MEMORY_SUCCESS; + priv->ta_in_mem_txn_count++; + } else { + /* Post-op on TA not needed as the fop succeeded only on the + * in-memory bad data brick and not the good one. 
Fail the fop.*/ + local->fop_state = TA_INFO_IN_MEMORY_FAILED; + priv->ta_in_mem_txn_count++; + } + } + UNLOCK(&priv->lock); +} - priv = this->private; - local = frame->local; - curr_read_child = afr_read_child (this, inode); - pending = local->pending; +static void +afr_ta_fill_failed_subvol(afr_private_t *priv, afr_local_t *local) +{ + int i = 0; - if (pending[curr_read_child][idx] != 0) - return; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) { + local->ta_failed_subvol = i; + break; + } + } +} - /* need to set new read_child */ - for (new_read_child = 0; new_read_child < priv->child_count; - new_read_child++) { +static void +afr_post_op_handle_success(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; - if (!priv->child_up[new_read_child]) - /* child is down */ - continue; + local = frame->local; + if (local->is_new_entry == _gf_true) { + afr_mark_new_entry_changelog(frame, this); + } + afr_changelog_post_op_do(frame, this); - if (pending[new_read_child][idx] == 0) - /* op just failed */ - continue; + return; +} - break; - } +static void +afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno) +{ + afr_changelog_post_op_fail(frame, this, op_errno); - if (new_read_child == priv->child_count) - /* all children uneligible. 
leave as-is */ - return; + return; +} - afr_set_read_child (this, inode, new_read_child); +static void +afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int on_wire_count = 0; + + priv = this->private; + local = frame->local; + + afr_ta_set_fop_state(priv, local, &on_wire_count); + + switch (local->fop_state) { + case TA_GET_INFO_FROM_TA_FILE: + if (on_wire_count == 1) + afr_ta_post_op_synctask(this, local); + /*else, fop is queued in ta_onwireq.*/ + break; + case TA_WAIT_FOR_NOTIFY_LOCK_REL: + /*Post releasing the notify lock, we will act on this queue*/ + break; + case TA_INFO_IN_MEMORY_SUCCESS: + afr_post_op_handle_success(frame, this); + break; + case TA_INFO_IN_MEMORY_FAILED: + afr_post_op_handle_failure(frame, this, EIO); + break; + default: + break; + } + return; } +static void +afr_handle_failure_using_thin_arbiter(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + + afr_ta_fill_failed_subvol(priv, local); + gf_msg_debug(this->name, 0, + "Fop failed on data brick (%s) for gfid=%s. 
" + "ta info needed to decide fop result.", + priv->children[local->ta_failed_subvol]->name, + uuid_utoa(local->inode->gfid)); + afr_ta_decide_post_op_state(frame, this); +} -int -afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +void +afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = this->private; - afr_internal_lock_t *int_lock = NULL; - int ret = 0; - int i = 0; - int call_count = 0; + afr_private_t *priv = this->private; + afr_local_t *local = NULL; + dict_t *xattr = NULL; + int i = 0; + int ret = 0; + int idx = 0; + int nothing_failed = 1; + gf_boolean_t need_undirty = _gf_false; + + afr_handle_quorum(frame, this); + local = frame->local; + idx = afr_index_for_transaction_type(local->transaction.type); + + xattr = dict_new(); + if (!xattr) { + afr_changelog_post_op_fail(frame, this, ENOMEM); + goto out; + } + + nothing_failed = afr_txn_nothing_failed(frame, this); + + if (afr_changelog_pre_op_uninherit(frame, this)) + need_undirty = _gf_false; + else + need_undirty = _gf_true; + + if (local->op_ret < 0 && !nothing_failed) { + if (afr_need_dirty_marking(frame, this)) { + local->dirty[idx] = hton32(1); + goto set_dirty; + } - afr_local_t * local = NULL; - afr_fd_ctx_t *fdctx = NULL; - dict_t **xattr = NULL; - int piggyback = 0; - int index = 0; - int nothing_failed = 1; + afr_changelog_post_op_done(frame, this); + goto out; + } + + if (nothing_failed && !need_undirty) { + afr_changelog_post_op_done(frame, this); + goto out; + } + + if (local->transaction.in_flight_sb) { + afr_changelog_post_op_fail(frame, this, + local->transaction.in_flight_sb_errno); + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + local->pending[i][idx] = hton32(1); + } + + ret = afr_set_pending_dict(priv, xattr, local->pending); + if (ret < 0) { + afr_changelog_post_op_fail(frame, this, ENOMEM); + goto out; + } + + if (need_undirty) + local->dirty[idx] = hton32(-1); + else + 
local->dirty[idx] = hton32(0); + +set_dirty: + ret = dict_set_static_bin(xattr, AFR_DIRTY, local->dirty, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + afr_changelog_post_op_fail(frame, this, ENOMEM); + goto out; + } + + afr_changelog_do(frame, this, xattr, afr_changelog_post_op_done, + AFR_TRANSACTION_POST_OP); +out: + if (xattr) + dict_unref(xattr); - local = frame->local; - int_lock = &local->internal_lock; + return; +} - __mark_down_children (local->pending, priv->child_count, - local->child_up, local->transaction.type); +static int +afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int failed_count = 0; + + priv = this->private; + local = frame->local; + + if (priv->thin_arbiter_count) { + failed_count = AFR_COUNT(local->transaction.failed_subvols, + priv->child_count); + if (failed_count == 1) { + afr_handle_failure_using_thin_arbiter(frame, this); + return 0; + } else { + /* Txn either succeeded or failed on both data bricks. Let + * post_op_do handle it as the case might be. 
*/ + } + } - if (local->fd) - afr_update_read_child (frame, this, local->fd->inode, - local->transaction.type); + afr_changelog_post_op_do(frame, this); + return 0; +} - xattr = alloca (priv->child_count * sizeof (*xattr)); - memset (xattr, 0, (priv->child_count * sizeof (*xattr))); - for (i = 0; i < priv->child_count; i++) { - xattr[i] = get_new_dict (); - dict_ref (xattr[i]); +gf_boolean_t +afr_changelog_pre_op_uninherit(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_inode_ctx_t *ctx = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + int type = 0; + + local = frame->local; + priv = this->private; + ctx = local->inode_ctx; + + type = afr_index_for_transaction_type(local->transaction.type); + if (type != AFR_DATA_TRANSACTION) + return !local->transaction.dirtied; + + if (local->transaction.no_uninherit) + return _gf_false; + + /* This function must be idempotent. So check if we + were called before and return the same answer again. 
+ + It is important to keep this function idempotent for + the call in afr_changelog_post_op_safe() to not have + side effects on the call from afr_changelog_post_op_now() + */ + if (local->transaction.uninherit_done) + return local->transaction.uninherit_value; + + LOCK(&local->inode->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] != ctx->pre_op_done[type][i]) { + ret = !local->transaction.dirtied; + goto unlock; + } } - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { - call_count *= 2; + if (ctx->inherited[type]) { + ret = _gf_true; + ctx->inherited[type]--; + } else if (ctx->on_disk[type]) { + ret = _gf_false; + ctx->on_disk[type]--; + } else { + /* ASSERT */ + ret = _gf_false; } - local->call_count = call_count; + if (!ctx->inherited[type] && !ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) + ctx->pre_op_done[type][i] = 0; + } + } +unlock: + UNLOCK(&local->inode->lock); - if (local->fd) - fdctx = afr_fd_ctx_get (local->fd, this); + local->transaction.uninherit_done = _gf_true; + local->transaction.uninherit_value = ret; - if (call_count == 0) { - /* no child is up */ - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } + return ret; +} - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - return 0; - } +gf_boolean_t +afr_changelog_pre_op_inherit(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + int type = 0; + + local = frame->local; + priv = this->private; + + if (local->transaction.type != AFR_DATA_TRANSACTION) + return _gf_false; + + type = afr_index_for_transaction_type(local->transaction.type); + + LOCK(&local->inode->lock); + { + if (!local->inode_ctx->on_disk[type]) { + /* nothing to inherit yet */ + ret = _gf_false; + goto unlock; + } - /* check if something has 
failed, to handle piggybacking */ - nothing_failed = 1; - index = afr_index_for_transaction_type (local->transaction.type); for (i = 0; i < priv->child_count; i++) { - if (local->pending[i][index] == 0) { - nothing_failed = 0; - break; - } + if (local->transaction.pre_op[i] != + local->inode_ctx->pre_op_done[type][i]) { + /* either inherit exactly, or don't */ + ret = _gf_false; + goto unlock; + } } - index = afr_index_for_transaction_type (local->transaction.type); - if (local->optimistic_change_log && - local->transaction.type != AFR_DATA_TRANSACTION) { - /* if nothing_failed, then local->pending[..] == {0 .. 0} */ - for (i = 0; i < priv->child_count; i++) - local->pending[i][index]++; - } + local->inode_ctx->inherited[type]++; - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; + ret = _gf_true; - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set pending entry"); - - - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - { - if (!fdctx) { - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - break; - } - - LOCK (&local->fd->lock); - { - piggyback = 0; - if (fdctx->pre_op_piggyback[i]) { - fdctx->pre_op_piggyback[i]--; - piggyback = 1; - } - } - UNLOCK (&local->fd->lock); - - if (piggyback && !nothing_failed) - ret = afr_set_piggyback_dict (priv, xattr[i], - local->pending, - local->transaction.type); - - if (nothing_failed && piggyback) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); - } else { - STACK_WIND_COOKIE (frame, - afr_changelog_post_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - } - break; - case AFR_METADATA_TRANSACTION: - { - if (nothing_failed) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - 
this, 1, 0, xattr[i]); - break; - } - - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - break; + local->transaction.inherited = _gf_true; + } +unlock: + UNLOCK(&local->inode->lock); - case AFR_ENTRY_RENAME_TRANSACTION: - { - if (nothing_failed) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); - } else { - STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - call_count--; - } + return ret; +} - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ +gf_boolean_t +afr_changelog_pre_op_update(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + int type = 0; - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); + local = frame->local; + priv = this->private; - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set pending entry"); + if (local->transaction.type == AFR_ENTRY_TRANSACTION || + local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) + return _gf_false; - /* fall through */ + if (local->transaction.inherited) + /* was already inherited in afr_changelog_pre_op */ + return _gf_false; - case AFR_ENTRY_TRANSACTION: - { - if (nothing_failed) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); - break; - } - - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - 
priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - break; - } + if (!local->transaction.dirtied) + return _gf_false; - if (!--call_count) - break; - } + if (!afr_txn_nothing_failed(frame, this)) + return _gf_false; - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); + type = afr_index_for_transaction_type(local->transaction.type); + + ret = _gf_false; + + LOCK(&local->inode->lock); + { + if (!local->inode_ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) + local->inode_ctx->pre_op_done[type][i] = + (!local->transaction.failed_subvols[i]); + } else { + for (i = 0; i < priv->child_count; i++) + if (local->inode_ctx->pre_op_done[type][i] != + (!local->transaction.failed_subvols[i])) { + local->transaction.no_uninherit = 1; + goto unlock; + } } + local->inode_ctx->on_disk[type]++; - return 0; -} + ret = _gf_true; + } +unlock: + UNLOCK(&local->inode->lock); + return ret; +} -int32_t -afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) +int +afr_changelog_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xattr, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = this->private; - loc_t * loc = NULL; + afr_local_t *local = NULL; + int call_count = -1; + int child_index = -1; - int call_count = -1; - int child_index = (long) cookie; + local = frame->local; + child_index = (long)cookie; - local = frame->local; - loc = &local->loc; + if (op_ret == -1) { + local->op_errno = op_errno; + afr_transaction_fop_failed(frame, this, child_index); + } - LOCK (&frame->lock); - { - if (op_ret == 1) { - /* special op_ret for piggyback */ - } + if (xattr) + local->transaction.changelog_xdata[child_index] = 
dict_ref(xattr); + + call_count = afr_frame_return(frame); + + if (call_count == 0) { + local->transaction.changelog_resume(frame, this); + } - if (op_ret == 0) { - __mark_pre_op_done_on_fd (frame, this, child_index); + return 0; +} + +void +afr_changelog_populate_xdata(call_frame_t *frame, afr_xattrop_type_t op, + dict_t **xdata, dict_t **newloc_xdata) +{ + int i = 0; + int ret = 0; + char *key = NULL; + int keylen = 0; + const char *name = NULL; + dict_t *xdata1 = NULL; + dict_t *xdata2 = NULL; + xlator_t *this = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_boolean_t need_entry_key_set = _gf_true; + + local = frame->local; + this = THIS; + priv = this->private; + + if (local->transaction.type == AFR_DATA_TRANSACTION || + local->transaction.type == AFR_METADATA_TRANSACTION) + goto out; + + if (!priv->esh_granular) + goto out; + + xdata1 = dict_new(); + if (!xdata1) + goto out; + + name = local->loc.name; + if (local->op == GF_FOP_LINK) + name = local->newloc.name; + + switch (op) { + case AFR_TRANSACTION_PRE_OP: + key = GF_XATTROP_ENTRY_IN_KEY; + break; + case AFR_TRANSACTION_POST_OP: + if (afr_txn_nothing_failed(frame, this)) { + key = GF_XATTROP_ENTRY_OUT_KEY; + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.failed_subvols[i]) + continue; + need_entry_key_set = _gf_false; + break; } + /* If the transaction itself did not fail and there + * are no failed subvolumes, check whether the fop + * failed due to a symmetric error. If it did, do + * not set the ENTRY_OUT xattr which would end up + * deleting a name index which was created possibly by + * an earlier entry txn that may have failed on some + * of the sub-volumes. 
+ */ + if (local->op_ret) + need_entry_key_set = _gf_false; + } else { + key = GF_XATTROP_ENTRY_IN_KEY; + } + break; + } + + if (need_entry_key_set) { + keylen = strlen(key); + ret = dict_set_strn(xdata1, key, keylen, (char *)name); + if (ret) + gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED, + "%s/%s: Could not set %s key during xattrop", + uuid_utoa(local->loc.pargfid), local->loc.name, key); + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + xdata2 = dict_new(); + if (!xdata2) + goto out; - if (op_ret == -1) { - local->child_up[child_index] = 0; + ret = dict_set_strn(xdata2, key, keylen, + (char *)local->newloc.name); + if (ret) + gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED, + "%s/%s: Could not set %s key during " + "xattrop", + uuid_utoa(local->newloc.pargfid), local->newloc.name, + key); + } + } - if (op_errno == ENOTSUP) { - gf_log (this->name, GF_LOG_ERROR, - "xattrop not supported by %s", - priv->children[child_index]->name); - local->op_ret = -1; + *xdata = xdata1; + *newloc_xdata = xdata2; + xdata1 = xdata2 = NULL; +out: + if (xdata1) + dict_unref(xdata1); + return; +} - } else if (!child_went_down (op_ret, op_errno)) { - gf_log (this->name, GF_LOG_ERROR, - "xattrop failed on child %s: %s", - priv->children[child_index]->name, - strerror (op_errno)); - } - local->op_errno = op_errno; - } +int +afr_changelog_prepare(xlator_t *this, call_frame_t *frame, int *call_count, + afr_changelog_resume_t changelog_resume, + afr_xattrop_type_t op, dict_t **xdata, + dict_t **newloc_xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - call_count = --local->call_count; - } - UNLOCK (&frame->lock); + local = frame->local; + priv = this->private; - if (call_count == 0) { - if ((local->op_ret == -1) && - (local->op_errno == ENOTSUP)) { - local->transaction.resume (frame, this); - } else { - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + *call_count = 
afr_changelog_call_count( + local->transaction.type, local->transaction.pre_op, + local->transaction.failed_subvols, priv->child_count); - afr_pid_restore (frame); + if (*call_count == 0) { + changelog_resume(frame, this); + return -1; + } - local->transaction.fop (frame, this); - } - } + afr_changelog_populate_xdata(frame, op, xdata, newloc_xdata); + local->call_count = *call_count; - return 0; + local->transaction.changelog_resume = changelog_resume; + return 0; } - int -afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) +afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume, afr_xattrop_type_t op) { - afr_private_t * priv = this->private; - int i = 0; - int ret = 0; - int call_count = 0; - dict_t **xattr = NULL; - afr_fd_ctx_t *fdctx = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + dict_t *newloc_xdata = NULL; + int i = 0; + int call_count = 0; + int ret = 0; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.changelog_xdata[i]) { + dict_unref(local->transaction.changelog_xdata[i]); + local->transaction.changelog_xdata[i] = NULL; + } + } + + ret = afr_changelog_prepare(this, frame, &call_count, changelog_resume, op, + &xdata, &newloc_xdata); - afr_local_t *local = NULL; - int piggyback = 0; + if (ret) + return 0; - local = frame->local; + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i] || + local->transaction.failed_subvols[i]) + continue; + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + if (!local->fd) { + STACK_WIND_COOKIE( + frame, afr_changelog_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->xattrop, + &local->loc, GF_XATTROP_ADD_ARRAY, xattr, xdata); + } else { + STACK_WIND_COOKIE( + frame, afr_changelog_cbk, (void *)(long)i, + priv->children[i], 
priv->children[i]->fops->fxattrop, + local->fd, GF_XATTROP_ADD_ARRAY, xattr, xdata); + } + break; + case AFR_ENTRY_RENAME_TRANSACTION: - xattr = alloca (priv->child_count * sizeof (*xattr)); - memset (xattr, 0, (priv->child_count * sizeof (*xattr))); + STACK_WIND_COOKIE(frame, afr_changelog_cbk, (void *)(long)i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr, newloc_xdata); + call_count--; - for (i = 0; i < priv->child_count; i++) { - xattr[i] = get_new_dict (); - dict_ref (xattr[i]); + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + if (local->fd) + STACK_WIND_COOKIE( + frame, afr_changelog_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->fxattrop, + local->fd, GF_XATTROP_ADD_ARRAY, xattr, xdata); + else + STACK_WIND_COOKIE(frame, afr_changelog_cbk, (void *)(long)i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr, xdata); + break; } - call_count = afr_up_children_count (priv->child_count, - local->child_up); + if (!--call_count) + break; + } - if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { - call_count *= 2; - } + if (xdata) + dict_unref(xdata); + if (newloc_xdata) + dict_unref(newloc_xdata); + return 0; +} - if (call_count == 0) { - /* no child is up */ - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } +static void +afr_init_optimistic_changelog_for_txn(xlator_t *this, afr_local_t *local) +{ + int locked_count = 0; + afr_private_t *priv = NULL; - local->internal_lock.lock_cbk = - local->transaction.done; - afr_unlock (frame, this); - return 0; - } + priv = this->private; - local->call_count = call_count; + locked_count = AFR_COUNT(local->transaction.pre_op, priv->child_count); + if (priv->optimistic_change_log && locked_count == priv->child_count) + local->optimistic_change_log = 1; - __mark_all_pending (local->pending, priv->child_count, - 
local->transaction.type); + return; +} - if (local->fd) - fdctx = afr_fd_ctx_get (local->fd, this); +int +afr_changelog_pre_op(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = this->private; + int i = 0; + int ret = 0; + int call_count = 0; + int op_errno = 0; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + unsigned char *locked_nodes = NULL; + int idx = -1; + gf_boolean_t pre_nop = _gf_true; + dict_t *xdata_req = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + idx = afr_index_for_transaction_type(local->transaction.type); + + locked_nodes = afr_locked_nodes_get(local->transaction.type, int_lock); + + for (i = 0; i < priv->child_count; i++) { + if (locked_nodes[i]) { + local->transaction.pre_op[i] = 1; + call_count++; + } else { + local->transaction.failed_subvols[i] = 1; + } + } + + afr_init_optimistic_changelog_for_txn(this, local); + + if (afr_changelog_pre_op_inherit(frame, this)) + goto next; + + /* This condition should not be met with present code, as + * transaction.done will be called if locks are not acquired on even a + * single node. + */ + if (call_count == 0) { + op_errno = ENOTCONN; + goto err; + } + + /* Check if the fop can be performed on at least + * quorum number of nodes. + */ + if (priv->quorum_count && !afr_has_fop_quorum(frame)) { + op_errno = int_lock->lock_op_errno; + if (op_errno == 0) + op_errno = afr_quorum_errno(priv); + goto err; + } + + xdata_req = dict_new(); + if (!xdata_req) { + op_errno = ENOMEM; + goto err; + } + + if (call_count < priv->child_count) + pre_nop = _gf_false; + + /* Set an all-zero pending changelog so that in the cbk, we can get the + * current on-disk values. 
In a replica 3 volume with arbiter enabled, + * these values are needed to arrive at a go/ no-go of the fop phase to + * avoid ending up in split-brain.*/ + + ret = afr_set_pending_dict(priv, xdata_req, local->pending); + if (ret < 0) { + op_errno = ENOMEM; + goto err; + } + + if (afr_needs_changelog_update(local)) { + local->dirty[idx] = hton32(1); + + ret = dict_set_static_bin(xdata_req, AFR_DIRTY, local->dirty, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + op_errno = ENOMEM; + goto err; + } - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set pending entry"); - - - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - { - if (!fdctx) { - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i]); - break; - } - - LOCK (&local->fd->lock); - { - piggyback = 0; - if (fdctx->pre_op_done[i]) { - fdctx->pre_op_piggyback[i]++; - piggyback = 1; - fdctx->hit++; - } else { - fdctx->miss++; - } - } - UNLOCK (&local->fd->lock); - - if (piggyback) - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - break; - case AFR_METADATA_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); - break; - } - - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - 
priv->children[i], - priv->children[i]->fops->xattrop, - &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - break; + pre_nop = _gf_false; + local->transaction.dirtied = 1; + } - case AFR_ENTRY_RENAME_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); - } else { - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - - call_count--; - } + if (pre_nop) + goto next; + if (!local->pre_op_compat) { + dict_copy(xdata_req, local->xdata_req); + goto next; + } - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ + afr_changelog_do(frame, this, xdata_req, afr_transaction_perform_fop, + AFR_TRANSACTION_PRE_OP); - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); + if (xdata_req) + dict_unref(xdata_req); - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set pending entry"); + return 0; +next: + afr_transaction_perform_fop(frame, this); - /* fall through */ + if (xdata_req) + dict_unref(xdata_req); - case AFR_ENTRY_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); - break; - } - - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - break; - } + return 0; +err: + local->internal_lock.lock_cbk = afr_transaction_done; + local->op_ret = -1; + 
local->op_errno = op_errno; - if (!--call_count) - break; - } + afr_handle_lock_acquire_failure(local); - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } + if (xdata_req) + dict_unref(xdata_req); - return 0; + return 0; } +int +afr_post_nonblocking_lock_cbk(call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + + /* Initiate blocking locks if non-blocking has failed */ + if (int_lock->lock_op_ret < 0) { + gf_msg_debug(this->name, 0, + "Non blocking locks failed. Proceeding to blocking"); + int_lock->lock_cbk = afr_internal_lock_finish; + afr_blocking_lock(frame, this); + } else { + gf_msg_debug(this->name, 0, + "Non blocking locks done. Proceeding to FOP"); + + afr_internal_lock_finish(frame, this); + } + + return 0; +} int -afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) +afr_post_blocking_rename_cbk(call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; - local = frame->local; - int_lock = &local->internal_lock; + local = frame->local; + int_lock = &local->internal_lock; - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Blocking inodelks failed."); - local->transaction.done (frame, this); - } else { + if (int_lock->lock_op_ret < 0) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INTERNAL_LKS_FAILED, + "Blocking entrylks failed."); - gf_log (this->name, GF_LOG_DEBUG, - "Blocking inodelks done. Proceeding to FOP"); - afr_internal_lock_finish (frame, this); - } + afr_transaction_done(frame, this); + } else { + gf_msg_debug(this->name, 0, + "Blocking entrylks done. 
Proceeding to FOP"); - return 0; + afr_internal_lock_finish(frame, this); + } + return 0; } - int -afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) +afr_post_lower_unlock_cbk(call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; - local = frame->local; - int_lock = &local->internal_lock; + local = frame->local; + int_lock = &local->internal_lock; - /* Initiate blocking locks if non-blocking has failed */ - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking inodelks failed. Proceeding to blocking"); - int_lock->lock_cbk = afr_post_blocking_inodelk_cbk; - afr_blocking_lock (frame, this); - } else { + GF_ASSERT(!int_lock->higher_locked); - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking inodelks done. Proceeding to FOP"); - afr_internal_lock_finish (frame, this); - } + int_lock->lock_cbk = afr_post_blocking_rename_cbk; + afr_blocking_lock(frame, this); - return 0; + return 0; } +int +afr_set_transaction_flock(xlator_t *this, afr_local_t *local, + afr_lockee_t *lockee) +{ + afr_private_t *priv = NULL; + struct gf_flock *flock = NULL; + + priv = this->private; + flock = &lockee->flock; + + if ((priv->arbiter_count || local->transaction.eager_lock_on || + priv->full_lock) && + local->transaction.type == AFR_DATA_TRANSACTION) { + /*Lock entire file to avoid network split brains.*/ + flock->l_len = 0; + flock->l_start = 0; + } else { + flock->l_len = local->transaction.len; + flock->l_start = local->transaction.start; + } + flock->l_type = F_WRLCK; + + return 0; +} int -afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) +afr_lock(call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + int i = 0; + + local = frame->local; + int_lock = &local->internal_lock; + 
+ int_lock->lock_cbk = afr_post_nonblocking_lock_cbk; + int_lock->domain = this->name; + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + for (i = 0; i < int_lock->lockee_count; i++) { + afr_set_transaction_flock(this, local, &int_lock->lockee[i]); + } + + break; + + case AFR_ENTRY_TRANSACTION: + int_lock->lk_basename = local->transaction.basename; + if (local->transaction.parent_loc.path) + int_lock->lk_loc = &local->transaction.parent_loc; + else + GF_ASSERT(local->fd); + break; + case AFR_ENTRY_RENAME_TRANSACTION: + break; + } + afr_lock_nonblocking(frame, this); - local = frame->local; - int_lock = &local->internal_lock; + return 0; +} - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Blocking entrylks failed."); - local->transaction.done (frame, this); - } else { +static gf_boolean_t +afr_locals_overlap(afr_local_t *local1, afr_local_t *local2) +{ + uint64_t start1 = local1->transaction.start; + uint64_t start2 = local2->transaction.start; + uint64_t end1 = 0; + uint64_t end2 = 0; + + if (local1->transaction.len) + end1 = start1 + local1->transaction.len - 1; + else + end1 = ULLONG_MAX; + + if (local2->transaction.len) + end2 = start2 + local2->transaction.len - 1; + else + end2 = ULLONG_MAX; + + return ((end1 >= start2) && (end2 >= start1)); +} - gf_log (this->name, GF_LOG_DEBUG, - "Blocking entrylks done. Proceeding to FOP"); - afr_internal_lock_finish (frame, this); +gf_boolean_t +afr_has_lock_conflict(afr_local_t *local, gf_boolean_t waitlist_check) +{ + afr_local_t *each = NULL; + afr_lock_t *lock = NULL; + + lock = &local->inode_ctx->lock[local->transaction.type]; + /* + * Once full file lock is acquired in eager-lock phase, overlapping + * writes do not compete for inode-locks, instead are transferred to the + * next writes. Because of this overlapping writes are not ordered. + * This can cause inconsistencies in replication. 
+ * Example: + * Two overlapping writes w1, w2 are sent in parallel on same fd + * in two threads t1, t2. + * Both threads can execute afr_writev_wind in the following manner. + * t1 winds w1 on brick-0 + * t2 winds w2 on brick-0 + * t2 winds w2 on brick-1 + * t1 winds w1 on brick-1 + * + * This check makes sure the locks are not transferred for + * overlapping writes. + */ + list_for_each_entry(each, &lock->owners, transaction.owner_list) + { + if (afr_locals_overlap(each, local)) { + return _gf_true; } - - return 0; + } + + if (!waitlist_check) + return _gf_false; + list_for_each_entry(each, &lock->waiting, transaction.wait_list) + { + if (afr_locals_overlap(each, local)) { + return _gf_true; + } + } + return _gf_false; } +/* }}} */ +static void +afr_copy_inodelk_vars(afr_internal_lock_t *dst, afr_internal_lock_t *src, + xlator_t *this, int lockee_num) +{ + afr_private_t *priv = this->private; + afr_lockee_t *sl = &src->lockee[lockee_num]; + afr_lockee_t *dl = &dst->lockee[lockee_num]; + + dst->domain = src->domain; + dl->flock.l_len = sl->flock.l_len; + dl->flock.l_start = sl->flock.l_start; + dl->flock.l_type = sl->flock.l_type; + dl->locked_count = sl->locked_count; + memcpy(dl->locked_nodes, sl->locked_nodes, + priv->child_count * sizeof(*dl->locked_nodes)); +} -int -afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) +void +__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + gf_boolean_t conflict = _gf_false; + afr_local_t *each = NULL; + afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type]; + + while (!conflict) { + if (list_empty(&lock->waiting)) + return; + each = list_entry(lock->waiting.next, afr_local_t, + transaction.wait_list); + if (afr_has_lock_conflict(each, _gf_false)) { + conflict = _gf_true; + } + if (conflict && !list_empty(&lock->owners)) + return; + afr_copy_inodelk_vars(&each->internal_lock, 
&local->internal_lock, + each->transaction.frame->this, 0); + list_move_tail(&each->transaction.wait_list, shared); + list_add_tail(&each->transaction.owner_list, &lock->owners); + } +} - local = frame->local; - int_lock = &local->internal_lock; +static void +afr_lock_resume_shared(struct list_head *list) +{ + afr_local_t *each = NULL; + + while (!list_empty(list)) { + each = list_entry(list->next, afr_local_t, transaction.wait_list); + list_del_init(&each->transaction.wait_list); + afr_changelog_pre_op(each->transaction.frame, + each->transaction.frame->this); + } +} - /* Initiate blocking locks if non-blocking has failed */ - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks failed. Proceeding to blocking"); - int_lock->lock_cbk = afr_post_blocking_entrylk_cbk; - afr_blocking_lock (frame, this); +int +afr_internal_lock_finish(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = frame->local; + afr_lock_t *lock = NULL; + + local->internal_lock.lock_cbk = NULL; + if (!local->transaction.eager_lock_on) { + if (local->internal_lock.lock_op_ret < 0) { + afr_transaction_done(frame, this); + return 0; + } + afr_changelog_pre_op(frame, this); + } else { + lock = &local->inode_ctx->lock[local->transaction.type]; + if (local->internal_lock.lock_op_ret < 0) { + afr_handle_lock_acquire_failure(local); } else { - - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks done. 
Proceeding to FOP"); - afr_internal_lock_finish (frame, this); + lock->event_generation = local->event_generation; + afr_changelog_pre_op(frame, this); } + } - return 0; + return 0; } - -int -afr_post_blocking_rename_cbk (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_are_conflicting_ops_waiting(afr_local_t *local, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + afr_lock_t *lock = NULL; + lock = &local->inode_ctx->lock[local->transaction.type]; + + /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock + * is taken mount2 opened the same file, it won't be able to + * perform any {meta,}data operations until mount1 releases eager-lock. + * To avoid such scenario do not enable eager-lock for this transaction + * if open-fd-count is > 1 for metadata transactions and if num-inodelks > 1 + * for data transactions + */ + + if (local->transaction.type == AFR_METADATA_TRANSACTION) { + if (local->inode_ctx->open_fd_count > 1) { + return _gf_true; + } + } else if (local->transaction.type == AFR_DATA_TRANSACTION) { + if (lock->num_inodelks > 1) { + return _gf_true; + } + } - local = frame->local; - int_lock = &local->internal_lock; + return _gf_false; +} - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Blocking entrylks failed."); - local->transaction.done (frame, this); - } else { +gf_boolean_t +afr_is_delayed_changelog_post_op_needed(call_frame_t *frame, xlator_t *this, + int delay) +{ + afr_local_t *local = NULL; + afr_lock_t *lock = NULL; + gf_boolean_t res = _gf_false; + + local = frame->local; + lock = &local->inode_ctx->lock[local->transaction.type]; + + if (!afr_txn_nothing_failed(frame, this)) { + lock->release = _gf_true; + goto out; + } + + if (afr_are_conflicting_ops_waiting(local, this)) { + lock->release = _gf_true; + goto out; + } + + if (!list_empty(&lock->owners)) + goto out; + else + GF_ASSERT(list_empty(&lock->waiting)); + + if (lock->release) { + goto out; + } + + if 
(!delay) { + goto out; + } + + if (local->transaction.disable_delayed_post_op) { + goto out; + } + + if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP) && + (local->op != GF_FOP_FSYNC)) { + /*Only allow writes/fsyncs but shard does [f]xattrops on writes, so + * they are fine too*/ + goto out; + } + + res = _gf_true; +out: + return res; +} - gf_log (this->name, GF_LOG_DEBUG, - "Blocking entrylks done. Proceeding to FOP"); - afr_internal_lock_finish (frame, this); +void +afr_delayed_changelog_wake_up_cbk(void *data) +{ + afr_lock_t *lock = NULL; + afr_local_t *local = data; + afr_local_t *timer_local = NULL; + struct list_head shared; + + INIT_LIST_HEAD(&shared); + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK(&local->inode->lock); + { + timer_local = list_entry(lock->post_op.next, afr_local_t, + transaction.owner_list); + if (list_empty(&lock->owners) && (local == timer_local)) { + GF_ASSERT(list_empty(&lock->waiting)); + /*Last owner*/ + lock->release = _gf_true; + lock->delay_timer = NULL; } - return 0; + } + UNLOCK(&local->inode->lock); + afr_changelog_post_op_now(local->transaction.frame, + local->transaction.frame->this); } - +/* SET operation */ int -afr_post_lower_unlock_cbk (call_frame_t *frame, xlator_t *this) +afr_fd_report_unstable_write(xlator_t *this, afr_local_t *local) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; + LOCK(&local->inode->lock); + { + local->inode_ctx->witnessed_unstable_write = _gf_true; + } + UNLOCK(&local->inode->lock); - GF_ASSERT (!int_lock->higher_locked); - - int_lock->lock_cbk = afr_post_blocking_rename_cbk; - afr_blocking_lock (frame, this); - - return 0; + return 0; } - -int -afr_set_transaction_flock (afr_local_t *local) +/* TEST and CLEAR operation */ +gf_boolean_t +afr_fd_has_witnessed_unstable_write(xlator_t *this, inode_t *inode) { - afr_internal_lock_t *int_lock = NULL; + afr_inode_ctx_t *ctx = NULL; + 
gf_boolean_t witness = _gf_false; - int_lock = &local->internal_lock; + LOCK(&inode->lock); + { + (void)__afr_inode_ctx_get(this, inode, &ctx); - int_lock->lk_flock.l_len = local->transaction.len; - int_lock->lk_flock.l_start = local->transaction.start; - int_lock->lk_flock.l_type = F_WRLCK; + if (ctx->witnessed_unstable_write) { + witness = _gf_true; + ctx->witnessed_unstable_write = _gf_false; + } + } + UNLOCK(&inode->lock); - return 0; + return witness; } int -afr_lock_rec (call_frame_t *frame, xlator_t *this) +afr_changelog_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = (long)cookie; + int call_count = -1; + afr_local_t *local = NULL; - local = frame->local; - int_lock = &local->internal_lock; + priv = this->private; + local = frame->local; - int_lock->transaction_lk_type = AFR_TRANSACTION_LK; + if (op_ret != 0) { + /* Failure of fsync() is as good as failure of previous + write(). So treat it like one. + */ + gf_msg(this->name, GF_LOG_WARNING, op_errno, AFR_MSG_FSYNC_FAILED, + "fsync(%s) failed on subvolume %s. 
Transaction was %s", + uuid_utoa(local->fd->inode->gfid), + priv->children[child_index]->name, gf_fop_list[local->op]); - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - afr_set_transaction_flock (local); + afr_transaction_fop_failed(frame, this, child_index); + } - int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk; + call_count = afr_frame_return(frame); - afr_nonblocking_inodelk (frame, this); - break; + if (call_count == 0) + afr_changelog_post_op_now(frame, this); - case AFR_ENTRY_RENAME_TRANSACTION: + return 0; +} - int_lock->lock_cbk = afr_post_blocking_rename_cbk; - afr_blocking_lock (frame, this); - break; +int +afr_changelog_fsync(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + GF_UNUSED int ret = -1; - case AFR_ENTRY_TRANSACTION: - int_lock->lk_basename = local->transaction.basename; - if (&local->transaction.parent_loc) - int_lock->lk_loc = &local->transaction.parent_loc; - else - GF_ASSERT (local->fd); + local = frame->local; + priv = this->private; - int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk; - afr_nonblocking_entrylk (frame, this); - break; - } + call_count = AFR_COUNT(local->transaction.pre_op, priv->child_count); + if (!call_count) { + /* will go straight to unlock */ + afr_changelog_post_op_now(frame, this); return 0; -} + } + local->call_count = call_count; -int -afr_lock (call_frame_t *frame, xlator_t *this) -{ - afr_pid_save (frame); + xdata = dict_new(); + if (xdata) { + ret = dict_set_int32_sizen(xdata, "batch-fsync", 1); + ret = dict_set_str(xdata, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + } - frame->root->pid = (long) frame->root; + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; - afr_set_lk_owner (frame, this); + STACK_WIND_COOKIE(frame, afr_changelog_fsync_cbk, (void *)(long)i, + priv->children[i], 
priv->children[i]->fops->fsync, + local->fd, 1, xdata); + if (!--call_count) + break; + } - afr_set_lock_number (frame, this); + if (xdata) + dict_unref(xdata); - return afr_lock_rec (frame, this); + return 0; } - -/* }}} */ - int -afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) +afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - priv = this->private; - local = frame->local; + local = frame->local; + priv = this->private; - if (__changelog_needed_pre_op (frame, this)) { - afr_changelog_pre_op (frame, this); - } else { - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { + afr_changelog_post_op_now(frame, this); + return 0; + } + + if (afr_changelog_pre_op_uninherit(frame, this) && + afr_txn_nothing_failed(frame, this)) { + /* just detected that this post-op is about to + be optimized away as a new write() has + already piggybacked on this frame's changelog. + */ + afr_changelog_post_op_now(frame, this); + return 0; + } + + /* Calling afr_changelog_post_op_now() now will result in + issuing ->[f]xattrop(). + + Performing a hard POST-OP (->[f]xattrop() FOP) is a more + responsible operation than what it might appear on the surface. + + The changelog of a file (in the xattr of the file on the server) + stores information (pending count) about the state of the file + on the OTHER server. This changelog is blindly trusted, and must + therefore be updated in such a way it remains trustworthy. This + implies that decrementing the pending count (essentially "clearing + the dirty flag") must be done STRICTLY after we are sure that the + operation on the other server has reached stable storage.
+ + While the backend filesystem on that server will eventually flush + it to stable storage, we (being in userspace) have no mechanism + to get notified when the write became "stable". + + This means we need to take matters into our own hands and issue an + fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, + and get an acknowledgement for it. And we need to wait for the + fsync() acknowledgement before initiating the hard POST-OP. + + However if the FD itself was opened in O_SYNC or O_DSYNC then + we are already guaranteed that the writes were made stable as + part of the FOP itself. The same holds true for NFS stable + writes which happen on an anonymous FD with O_DSYNC or O_SYNC + flag set in the writev() @flags param. For all other write types, + mark a flag in the inode ctx whenever an unstable write is witnessed.
+ */ + if (priv->ensure_durability) { + /* Time to fsync() */ + afr_changelog_fsync(frame, this); + } else { + afr_changelog_post_op_now(frame, this); + } + + return 0; +} - afr_pid_restore (frame); +void +afr_changelog_post_op(call_frame_t *frame, xlator_t *this) +{ + struct timespec delta = { + 0, + }; + afr_private_t *priv = NULL; + afr_local_t *local = frame->local; + afr_lock_t *lock = NULL; + gf_boolean_t post_op = _gf_true; + struct list_head shared; + + priv = this->private; + delta.tv_sec = priv->post_op_delay_secs; + delta.tv_nsec = 0; + + INIT_LIST_HEAD(&shared); + if (!local->transaction.eager_lock_on) + goto out; + + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK(&local->inode->lock); + { + list_del_init(&local->transaction.owner_list); + list_add(&local->transaction.owner_list, &lock->post_op); + __afr_transaction_wake_shared(local, &shared); + + if (!afr_is_delayed_changelog_post_op_needed(frame, this, + delta.tv_sec)) { + if (list_empty(&lock->owners)) + lock->release = _gf_true; + goto unlock; + } - local->transaction.fop (frame, this); + GF_ASSERT(lock->delay_timer == NULL); + lock->delay_timer = gf_timer_call_after( + this->ctx, delta, afr_delayed_changelog_wake_up_cbk, local); + if (!lock->delay_timer) { + lock->release = _gf_true; + } else { + post_op = _gf_false; } + } +unlock: + UNLOCK(&local->inode->lock); - return 0; -} + if (!list_empty(&shared)) { + afr_lock_resume_shared(&shared); + } +out: + if (post_op) { + if (!local->transaction.eager_lock_on || lock->release) { + afr_changelog_post_op_safe(frame, this); + } else { + afr_changelog_post_op_now(frame, this); + } + } +} int -afr_transaction_resume (call_frame_t *frame, xlator_t *this) +afr_transaction_resume(call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; + local = 
frame->local; - if (__changelog_needed_post_op (frame, this)) { - afr_changelog_post_op (frame, this); - } else { - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); - } else { - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - } - } + afr_restore_lk_owner(frame); - return 0; -} + afr_handle_symmetric_errors(frame, this); + if (!local->pre_op_compat) + /* new mode, pre-op was done along + with OP */ + afr_changelog_pre_op_update(frame, this); + + afr_changelog_post_op(frame, this); + + return 0; +} /** * afr_transaction_fop_failed - inform that an fop failed */ void -afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index) +afr_transaction_fop_failed(call_frame_t *frame, xlator_t *this, int child_index) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + afr_local_t *local = NULL; - local = frame->local; - priv = this->private; + local = frame->local; - __mark_child_dead (local->pending, priv->child_count, - child_index, local->transaction.type); + local->transaction.failed_subvols[child_index] = 1; } +static gf_boolean_t +__need_previous_lock_unlocked(afr_local_t *local) +{ + afr_lock_t *lock = NULL; + + lock = &local->inode_ctx->lock[local->transaction.type]; + if (!lock->acquired) + return _gf_false; + if (lock->acquired && lock->event_generation != local->event_generation) + return _gf_true; + return _gf_false; +} -int -afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) +void +__afr_eager_lock_handle(afr_local_t *local, gf_boolean_t *take_lock, + gf_boolean_t *do_pre_op, afr_local_t **timer_local) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + afr_lock_t *lock = NULL; + afr_local_t *owner_local = NULL; + xlator_t *this = local->transaction.frame->this; + + local->transaction.eager_lock_on = _gf_true; + afr_set_lk_owner(local->transaction.frame, this, local->inode); + + lock = 
&local->inode_ctx->lock[local->transaction.type]; + if (__need_previous_lock_unlocked(local)) { + if (!list_empty(&lock->owners)) { + lock->release = _gf_true; + } else if (lock->delay_timer) { + lock->release = _gf_true; + if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) { + /* It will be put in frozen list + * in the code flow below*/ + } else { + *timer_local = list_entry(lock->post_op.next, afr_local_t, + transaction.owner_list); + lock->delay_timer = NULL; + } + } + } + + if (lock->release) { + list_add_tail(&local->transaction.wait_list, &lock->frozen); + *take_lock = _gf_false; + goto out; + } + + if (lock->delay_timer) { + *take_lock = _gf_false; + if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) { + list_add_tail(&local->transaction.wait_list, &lock->frozen); + } else { + *timer_local = list_entry(lock->post_op.next, afr_local_t, + transaction.owner_list); + afr_copy_inodelk_vars(&local->internal_lock, + &(*timer_local)->internal_lock, this, 0); + lock->delay_timer = NULL; + *do_pre_op = _gf_true; + list_add_tail(&local->transaction.owner_list, &lock->owners); + } + goto out; + } + + if (!list_empty(&lock->owners)) { + if (!lock->acquired || afr_has_lock_conflict(local, _gf_true)) { + list_add_tail(&local->transaction.wait_list, &lock->waiting); + *take_lock = _gf_false; + goto out; + } + owner_local = list_entry(lock->owners.next, afr_local_t, + transaction.owner_list); + afr_copy_inodelk_vars(&local->internal_lock, + &owner_local->internal_lock, this, 0); + *take_lock = _gf_false; + *do_pre_op = _gf_true; + } + + if (lock->acquired) + GF_ASSERT(!(*take_lock)); + list_add_tail(&local->transaction.owner_list, &lock->owners); +out: + return; +} - local = frame->local; - priv = this->private; +void +afr_transaction_start(afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = NULL; + gf_boolean_t take_lock = _gf_true; + gf_boolean_t do_pre_op = _gf_false; + afr_local_t *timer_local = NULL; + + priv = this->private; + + if 
(local->transaction.type != AFR_DATA_TRANSACTION && + local->transaction.type != AFR_METADATA_TRANSACTION) + goto lock_phase; + + if (!priv->eager_lock) + goto lock_phase; + + LOCK(&local->inode->lock); + { + __afr_eager_lock_handle(local, &take_lock, &do_pre_op, &timer_local); + } + UNLOCK(&local->inode->lock); +lock_phase: + if (!local->transaction.eager_lock_on) { + afr_set_lk_owner(local->transaction.frame, this, + local->transaction.frame->root); + } + + if (take_lock) { + afr_lock(local->transaction.frame, this); + } else if (do_pre_op) { + afr_changelog_pre_op(local->transaction.frame, this); + } + /*Always call delayed_changelog_wake_up_cbk after calling pre-op above + * so that any inheriting can happen*/ + if (timer_local) + afr_delayed_changelog_wake_up_cbk(timer_local); +} - afr_transaction_local_init (local, priv); +int +afr_write_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) +{ + afr_local_t *local = frame->local; + + if (err) { + AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(-1, err); + goto fail; + } + + afr_transaction_start(local, this); + return 0; +fail: + local->transaction.unwind(frame, this); + AFR_STACK_DESTROY(frame); + return 0; +} - local->transaction.resume = afr_transaction_resume; - local->transaction.type = type; +int +afr_transaction_lockee_init(call_frame_t *frame) +{ + afr_local_t *local = frame->local; + afr_internal_lock_t *int_lock = &local->internal_lock; + afr_private_t *priv = frame->this->private; + int ret = 0; + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + ret = afr_add_inode_lockee(local, priv->child_count); + break; + + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + ret = afr_add_entry_lockee(local, &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) { + goto out; + } + if (local->op == GF_FOP_RENAME) { + ret = afr_add_entry_lockee( + local, &local->transaction.new_parent_loc, + 
local->transaction.new_basename, priv->child_count); + if (ret) { + goto out; + } - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - afr_internal_lock_finish (frame, this); - } else { - afr_lock (frame, this); - } + if (local->newloc.inode && + IA_ISDIR(local->newloc.inode->ia_type)) { + ret = afr_add_entry_lockee(local, &local->newloc, NULL, + priv->child_count); + if (ret) { + goto out; + } + } + } else if (local->op == GF_FOP_RMDIR) { + ret = afr_add_entry_lockee(local, &local->loc, NULL, + priv->child_count); + if (ret) { + goto out; + } + } + + if (int_lock->lockee_count > 1) { + qsort(int_lock->lockee, int_lock->lockee_count, + sizeof(*int_lock->lockee), afr_entry_lockee_cmp); + } + break; + } +out: + return ret; +} - return 0; +int +afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int ret = -1; + int event_generation = 0; + + local = frame->local; + priv = this->private; + local->transaction.frame = frame; + + local->transaction.type = type; + + if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) { + ret = -afr_quorum_errno(priv); + goto out; + } + + if (!afr_is_consistent_io_possible(local, priv, &ret)) { + ret = -ret; /*op_errno to ret conversion*/ + goto out; + } + + if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) { + ret = -afr_quorum_errno(priv); + goto out; + } + + ret = afr_transaction_local_init(local, this); + if (ret < 0) + goto out; + + ret = afr_transaction_lockee_init(frame); + if (ret) + goto out; + + if (type != AFR_METADATA_TRANSACTION) { + goto txn_start; + } + + ret = afr_inode_get_readable(frame, local->inode, this, local->readable, + &event_generation, type); + if (ret < 0 || + afr_is_inode_refresh_reqd(local->inode, this, priv->event_generation, + event_generation)) { + afr_inode_refresh(frame, this, local->inode, local->loc.gfid, + afr_write_txn_refresh_done); + ret = 0; + goto out; + } + 
+txn_start: + ret = 0; + afr_transaction_start(local, this); +out: + return ret; } diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index 84cf31d633e..beefa26f4a6 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -1,33 +1,75 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef __TRANSACTION_H__ #define __TRANSACTION_H__ +#include "afr.h" + void -afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, - int child_index); +afr_transaction_fop_failed(call_frame_t *frame, xlator_t *this, + int child_index); + +int32_t +afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type); int -afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); +afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int32_t **pending); -int32_t -afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); +void +afr_delayed_changelog_wake_up(xlator_t *this, fd_t *fd); + +void +__mark_all_success(call_frame_t *frame, xlator_t *this); + +gf_boolean_t +afr_txn_nothing_failed(call_frame_t *frame, xlator_t *this); + +int +afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_read_txn_wind_t readfn, afr_transaction_type type); + +int +afr_read_txn_continue(call_frame_t *frame, xlator_t *this, int subvol); + +void +afr_pending_read_increment(afr_private_t *priv, int child_index); +void +afr_pending_read_decrement(afr_private_t *priv, int child_index); + +call_frame_t * +afr_transaction_detach_fop_frame(call_frame_t *frame); +gf_boolean_t +afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame); +gf_boolean_t +afr_needs_changelog_update(afr_local_t *local); +void +afr_zero_fill_stat(afr_local_t *local); + +void +afr_pick_error_xdata(afr_local_t *local, afr_private_t *priv, inode_t *inode1, + unsigned char *readable1, inode_t *inode2, + unsigned char *readable2); +int +afr_transaction_resume(call_frame_t *frame, xlator_t *this); + +int +afr_lock(call_frame_t *frame, xlator_t *this); + +void +afr_delayed_changelog_wake_up_cbk(void *data); + +int +afr_release_notify_lock_for_ta(void *opaque); + +int +afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque); #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c 
b/xlators/cluster/afr/src/afr.c index dff55bc9584..df7366f0a65 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -24,837 +15,1330 @@ #include <stdlib.h> #include <signal.h> -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif #include "afr-common.c" +#include "afr-messages.h" + +struct volume_options options[]; + +static char *afr_favorite_child_policies[AFR_FAV_CHILD_POLICY_MAX + 1] = { + [AFR_FAV_CHILD_NONE] = "none", + [AFR_FAV_CHILD_BY_SIZE] = "size", + [AFR_FAV_CHILD_BY_CTIME] = "ctime", + [AFR_FAV_CHILD_BY_MTIME] = "mtime", + [AFR_FAV_CHILD_BY_MAJORITY] = "majority", + [AFR_FAV_CHILD_POLICY_MAX] = NULL, +}; int32_t -notify (xlator_t *this, int32_t event, - void *data, ...) +notify(xlator_t *this, int32_t event, void *data, ...) 
{ - int ret = -1; + int ret = -1; + va_list ap; + void *data2 = NULL; - ret = afr_notify (this, event, data); + va_start(ap, data); + data2 = va_arg(ap, dict_t *); + va_end(ap); + ret = afr_notify(this, event, data, data2); - return ret; + return ret; } int32_t -mem_acct_init (xlator_t *this) +mem_acct_init(xlator_t *this) { - int ret = -1; + int ret = -1; - if (!this) - return ret; + if (!this) + return ret; - ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1); - - if (ret != 0) { - gf_log(this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } + ret = xlator_mem_acct_init(this, gf_afr_mt_end + 1); + if (ret != 0) { return ret; + } + + return ret; } int -validate_options (xlator_t *this, char **op_errstr) +xlator_subvolume_index(xlator_t *this, xlator_t *subvol) { - int ret = 0; - volume_opt_list_t *vol_opt = NULL; - volume_opt_list_t *tmp; + int index = -1; + int i = 0; + xlator_list_t *list = NULL; - if (!this) { - gf_log (this->name, GF_LOG_DEBUG, "'this' not a valid ptr"); - ret =-1; - goto out; + list = this->children; + + while (list) { + if (subvol == list->xlator || + strcmp(subvol->name, list->xlator->name) == 0) { + index = i; + break; } + list = list->next; + i++; + } - if (list_empty (&this->volume_options)) - goto out; + return index; +} - vol_opt = list_entry (this->volume_options.next, - volume_opt_list_t, list); - list_for_each_entry_safe (vol_opt, tmp, &this->volume_options, list) { - ret = validate_xlator_volume_options_attacherr (this, - vol_opt->given_opt, - op_errstr); - } +static void +fix_quorum_options(xlator_t *this, afr_private_t *priv, char *qtype, + dict_t *options) +{ + if (dict_get_sizen(options, "quorum-type") == NULL) { + /* If user doesn't configure anything enable auto-quorum if the + * replica has more than two subvolumes */ + if (priv->child_count > 2) + qtype = "auto"; + } + + if (priv->quorum_count && strcmp(qtype, "fixed")) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_OVERRIDE, + 
"quorum-type %s overriding quorum-count %u", qtype, + priv->quorum_count); + } + + if (!strcmp(qtype, "none")) { + priv->quorum_count = 0; + } else if (!strcmp(qtype, "auto")) { + priv->quorum_count = AFR_QUORUM_AUTO; + } +} -out: +int +afr_set_favorite_child_policy(afr_private_t *priv, char *policy) +{ + int index = -1; - return ret; + index = gf_get_index_by_elem(afr_favorite_child_policies, policy); + if (index < 0 || index >= AFR_FAV_CHILD_POLICY_MAX) + return -1; + + priv->fav_child_policy = index; + + return 0; +} + +static void +set_data_self_heal_algorithm(afr_private_t *priv, char *algo) +{ + if (!algo) { + priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC; + } else if (strcmp(algo, "full") == 0) { + priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_FULL; + } else if (strcmp(algo, "diff") == 0) { + priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DIFF; + } else { + priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC; + } } +void +afr_handle_anon_inode_options(afr_private_t *priv, dict_t *options) +{ + char *volfile_id_str = NULL; + uuid_t anon_inode_gfid = {0}; + + /*If volume id is not present don't enable anything*/ + if (dict_get_str(options, "volume-id", &volfile_id_str)) + return; + GF_ASSERT(strlen(AFR_ANON_DIR_PREFIX) + strlen(volfile_id_str) <= NAME_MAX); + /*anon_inode_name is not supposed to change once assigned*/ + if (!priv->anon_inode_name[0]) { + snprintf(priv->anon_inode_name, sizeof(priv->anon_inode_name), "%s-%s", + AFR_ANON_DIR_PREFIX, volfile_id_str); + gf_uuid_parse(volfile_id_str, anon_inode_gfid); + /*Flip a bit to make sure volfile-id and anon-gfid are not same*/ + anon_inode_gfid[0] ^= 1; + uuid_utoa_r(anon_inode_gfid, priv->anon_gfid_str); + } +} int -reconfigure (xlator_t *this, dict_t *options) +reconfigure(xlator_t *this, dict_t *options) { + afr_private_t *priv = NULL; + xlator_t *read_subvol = NULL; + int read_subvol_index = -1; + int timeout_old = 0; + int ret = -1; + int index = -1; + char *qtype = NULL; + 
char *fav_child_policy = NULL; + char *data_self_heal = NULL; + char *data_self_heal_algorithm = NULL; + char *locking_scheme = NULL; + gf_boolean_t consistent_io = _gf_false; + gf_boolean_t choose_local_old = _gf_false; + gf_boolean_t enabled_old = _gf_false; - gf_boolean_t metadata_self_heal; /* on/off */ - gf_boolean_t entry_self_heal; - gf_boolean_t data_self_heal; - gf_boolean_t data_change_log; /* on/off */ - gf_boolean_t metadata_change_log; /* on/off */ - gf_boolean_t entry_change_log; /* on/off */ - gf_boolean_t strict_readdir; - - afr_private_t * priv = NULL; - xlator_list_t * trav = NULL; - - char * read_subvol = NULL; - char * self_heal = NULL; - char * change_log = NULL; - char * str_readdir = NULL; - char * self_heal_algo = NULL; - - int32_t background_count = 0; - int32_t window_size = 0; - - int read_ret = -1; - int dict_ret = -1; - int flag = 1; - int ret = 0; - int temp_ret = -1; - - priv = this->private; - - dict_ret = dict_get_int32 (options, "background-self-heal-count", - &background_count); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring background self-heal count to %d", - background_count); - - priv->background_self_heal_count = background_count; - } - - dict_ret = dict_get_str (options, "metadata-self-heal", - &self_heal); - if (dict_ret == 0) { - temp_ret = gf_string2boolean (self_heal, &metadata_self_heal); - if (temp_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Reconfiguration Invalid 'option metadata" - "-self-heal %s'. 
Defaulting to old value.", - self_heal); - ret = -1; - goto out; - } - - priv->metadata_self_heal = metadata_self_heal; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option metadata" - "-self-heal %s'.", - self_heal); - } - - dict_ret = dict_get_str (options, "data-self-heal", - &self_heal); - if (dict_ret == 0) { - temp_ret = gf_string2boolean (self_heal, &data_self_heal); - if (temp_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Reconfiguration Invalid 'option data" - "-self-heal %s'. Defaulting to old value.", - self_heal); - ret = -1; - goto out; - } - - priv->data_self_heal = data_self_heal; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option data" - "-self-heal %s'.", self_heal); - } - - dict_ret = dict_get_str (options, "entry-self-heal", - &self_heal); - if (dict_ret == 0) { - temp_ret = gf_string2boolean (self_heal, &entry_self_heal); - if (temp_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Reconfiguration Invalid 'option data" - "-self-heal %s'. Defaulting to old value.", - self_heal); - ret = -1; - goto out; - } - - priv->entry_self_heal = entry_self_heal; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option entry" - "-self-heal %s'.", self_heal); - } - - - dict_ret = dict_get_str (options, "strict-readdir", - &str_readdir); - if (dict_ret == 0) { - temp_ret = gf_string2boolean (str_readdir, &strict_readdir); - if (temp_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option strict-readdir %s'. 
" - "Defaulting to old value.", - str_readdir); - ret = -1; - goto out; - } - - priv->strict_readdir = strict_readdir; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option strict" - "-readdir %s'.", str_readdir); - } - - dict_ret = dict_get_int32 (options, "data-self-heal-window-size", - &window_size); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring, Setting data self-heal window size to %d", - window_size); - - priv->data_self_heal_window_size = window_size; - } - else { - priv->data_self_heal_window_size = 16; - } + priv = this->private; + GF_OPTION_RECONF("metadata-splitbrain-forced-heal", + priv->metadata_splitbrain_forced_heal, options, bool, out); - dict_ret = dict_get_str (options, "data-change-log", - &change_log); - if (dict_ret == 0) { - temp_ret = gf_string2boolean (change_log, &data_change_log); - if (temp_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Reconfiguration Invalid 'option data-" - "change-log %s'. Defaulting to old value.", - change_log); - ret = -1; - goto out; - } - - priv->data_change_log = data_change_log; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option data-" - "change-log %s'.", change_log); - } - - dict_ret = dict_get_str (options, "metadata-change-log", - &change_log); - if (dict_ret == 0) { - temp_ret = gf_string2boolean (change_log, - &metadata_change_log); - if (temp_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option metadata-change-log %s'. " - "Defaulting to metadata-change-log as 'off'.", - change_log); - ret = -1; - goto out; - } - - priv->metadata_change_log = metadata_change_log; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option metadata-" - "change-log %s'.", change_log); - } - - dict_ret = dict_get_str (options, "entry-change-log", - &change_log); - if (dict_ret == 0) { - temp_ret = gf_string2boolean (change_log, &entry_change_log); - if (temp_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option entry-change-log %s'. 
" - "Defaulting to entry-change-log as 'on'.", - change_log); - ret = -1; - goto out; - } - - priv->entry_change_log = entry_change_log; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option entry-" - "change-log %s'.", change_log); - } - - dict_ret = dict_get_str (options, "data-self-heal-algorithm", - &self_heal_algo); - if (dict_ret == 0) { - /* Handling both strcmp cases - s1 > s2 and s1 < s2 */ - - if (!strcmp (self_heal_algo, "full")) { - priv->data_self_heal_algorithm = self_heal_algo; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option data-self" - "heal-algorithm %s'.", self_heal_algo); - goto next; - } - - if (!strcmp (self_heal_algo, "diff")) { - priv->data_self_heal_algorithm = self_heal_algo; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option data-self" - "heal-algorithm %s'.", self_heal_algo); - goto next; - } - - gf_log (this->name, GF_LOG_WARNING, - "Invalid self-heal algorithm %s," - "defaulting back to old value", - self_heal_algo); - ret = -1; - goto out; - } + GF_OPTION_RECONF("background-self-heal-count", + priv->background_self_heal_count, options, uint32, out); - read_ret = dict_get_str (options, "read-subvolume", &read_subvol); + GF_OPTION_RECONF("heal-wait-queue-length", priv->heal_wait_qlen, options, + uint32, out); - if (read_ret < 0) - goto next;// No need to traverse, hence set the next option + GF_OPTION_RECONF("metadata-self-heal", priv->metadata_self_heal, options, + bool, out); - trav = this->children; - flag = 0; - while (trav) { - if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) { - gf_log (this->name, GF_LOG_DEBUG, - "Subvolume '%s' specified as read child.", - trav->xlator->name); + GF_OPTION_RECONF("data-self-heal", data_self_heal, options, str, out); + if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1) + goto out; - flag = 1; - break; - } + GF_OPTION_RECONF("entry-self-heal", priv->entry_self_heal, options, bool, + out); + GF_OPTION_RECONF("data-self-heal-window-size", + 
priv->data_self_heal_window_size, options, uint32, out); - trav = trav->next; - } + GF_OPTION_RECONF("data-self-heal-algorithm", data_self_heal_algorithm, + options, str, out); + set_data_self_heal_algorithm(priv, data_self_heal_algorithm); - if (flag == 0 ) { + GF_OPTION_RECONF("halo-enabled", priv->halo_enabled, options, bool, out); - gf_log (this->name, GF_LOG_ERROR, - "Invalid 'option read-subvolume %s', no such subvolume" - , read_subvol); - ret = -1; - goto out; - } + GF_OPTION_RECONF("halo-shd-max-latency", priv->shd.halo_max_latency_msec, + options, uint32, out); + GF_OPTION_RECONF("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec, + options, uint32, out); -next: -out: - return ret; + GF_OPTION_RECONF("halo-max-latency", priv->halo_max_latency_msec, options, + uint32, out); + + GF_OPTION_RECONF("halo-max-replicas", priv->halo_max_replicas, options, + uint32, out); + GF_OPTION_RECONF("halo-min-replicas", priv->halo_min_replicas, options, + uint32, out); + + GF_OPTION_RECONF("read-subvolume", read_subvol, options, xlator, out); + + choose_local_old = priv->choose_local; + GF_OPTION_RECONF("choose-local", priv->choose_local, options, bool, out); + + if (choose_local_old != priv->choose_local) { + priv->read_child = -1; + if (choose_local_old == _gf_false) + priv->did_discovery = _gf_false; + } + + GF_OPTION_RECONF("read-hash-mode", priv->hash_mode, options, uint32, out); + + if (read_subvol) { + index = xlator_subvolume_index(this, read_subvol); + if (index == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL, + "%s not a subvolume", read_subvol->name); + goto out; + } + priv->read_child = index; + } + + GF_OPTION_RECONF("read-subvolume-index", read_subvol_index, options, int32, + out); + + if (read_subvol_index > -1) { + index = read_subvol_index; + if (index >= priv->child_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL, + "%d not a subvolume-index", index); + goto out; + } + priv->read_child = index; + } + + 
GF_OPTION_RECONF("pre-op-compat", priv->pre_op_compat, options, bool, out); + GF_OPTION_RECONF("locking-scheme", locking_scheme, options, str, out); + priv->granular_locks = (strcmp(locking_scheme, "granular") == 0); + GF_OPTION_RECONF("full-lock", priv->full_lock, options, bool, out); + GF_OPTION_RECONF("granular-entry-heal", priv->esh_granular, options, bool, + out); + + GF_OPTION_RECONF("eager-lock", priv->eager_lock, options, bool, out); + GF_OPTION_RECONF("optimistic-change-log", priv->optimistic_change_log, + options, bool, out); + GF_OPTION_RECONF("quorum-type", qtype, options, str, out); + GF_OPTION_RECONF("quorum-count", priv->quorum_count, options, uint32, out); + fix_quorum_options(this, priv, qtype, options); + if (priv->quorum_count && !afr_has_quorum(priv->child_up, this, NULL)) + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL, + "Client-quorum is not met"); + + GF_OPTION_RECONF("post-op-delay-secs", priv->post_op_delay_secs, options, + uint32, out); + + GF_OPTION_RECONF(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, options, + size_uint64, out); + /* Reset this so we re-discover in case the topology changed. 
*/ + GF_OPTION_RECONF("ensure-durability", priv->ensure_durability, options, + bool, out); + + enabled_old = priv->shd.enabled; + GF_OPTION_RECONF("self-heal-daemon", priv->shd.enabled, options, bool, out); + + GF_OPTION_RECONF("iam-self-heal-daemon", priv->shd.iamshd, options, bool, + out); + + timeout_old = priv->shd.timeout; + GF_OPTION_RECONF("heal-timeout", priv->shd.timeout, options, int32, out); + + GF_OPTION_RECONF("consistent-metadata", priv->consistent_metadata, options, + bool, out); + + GF_OPTION_RECONF("shd-max-threads", priv->shd.max_threads, options, uint32, + out); + + GF_OPTION_RECONF("shd-wait-qlength", priv->shd.wait_qlength, options, + uint32, out); + + GF_OPTION_RECONF("favorite-child-policy", fav_child_policy, options, str, + out); + if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1) + goto out; + + priv->did_discovery = _gf_false; + + GF_OPTION_RECONF("consistent-io", consistent_io, options, bool, out); + if (priv->quorum_count != 0) + consistent_io = _gf_false; + priv->consistent_io = consistent_io; + + afr_handle_anon_inode_options(priv, options); + + GF_OPTION_RECONF("use-anonymous-inode", priv->use_anon_inode, options, bool, + out); + if (priv->shd.enabled) { + if ((priv->shd.enabled != enabled_old) || + (timeout_old != priv->shd.timeout)) + afr_selfheal_childup(this, priv); + } + + ret = 0; +out: + return ret; } +static int +afr_pending_xattrs_init(afr_private_t *priv, xlator_t *this) +{ + int ret = -1; + int i = 0; + char *ptr = NULL; + char *ptr1 = NULL; + char *xattrs_list = NULL; + xlator_list_t *trav = NULL; + int child_count = -1; + + trav = this->children; + child_count = priv->child_count; + if (priv->thin_arbiter_count) { + /* priv->pending_key[THIN_ARBITER_BRICK_INDEX] is used as the + * name of the thin arbiter file for persistence across add/ + * removal of DHT subvols.*/ + child_count++; + } + + GF_OPTION_INIT("afr-pending-xattr", xattrs_list, str, out); + priv->pending_key = 
GF_CALLOC(sizeof(*priv->pending_key), child_count, + gf_afr_mt_char); + if (!priv->pending_key) { + ret = -ENOMEM; + goto out; + } + if (!xattrs_list) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_NO_CHANGELOG, + "Unable to fetch afr-pending-xattr option from volfile." + " Falling back to using client translator names. "); + + while (i < child_count) { + ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX, + trav->xlator->name); + if (ret == -1) { + ret = -ENOMEM; + goto out; + } + trav = trav->next; + i++; + } + ret = 0; + goto out; + } + + ptr = ptr1 = gf_strdup(xattrs_list); + if (!ptr) { + ret = -ENOMEM; + goto out; + } + for (i = 0, ptr = strtok(ptr, ","); ptr; ptr = strtok(NULL, ",")) { + ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX, + ptr); + if (ret == -1) { + ret = -ENOMEM; + goto out; + } + i++; + } + ret = 0; -static const char *favorite_child_warning_str = "You have specified subvolume '%s' " - "as the 'favorite child'. This means that if a discrepancy in the content " - "or attributes (ownership, permission, etc.) of a file is detected among " - "the subvolumes, the file on '%s' will be considered the definitive " - "version and its contents will OVERWRITE the contents of the file on other " - "subvolumes. All versions of the file except that on '%s' " - "WILL BE LOST."; +out: + GF_FREE(ptr1); + return ret; +} -static const char *no_lock_servers_warning_str = "You have set lock-server-count = 0. " - "This means correctness is NO LONGER GUARANTEED in all cases. If two or more " - "applications write to the same region of a file, there is a possibility that " - "its copies will be INCONSISTENT. Set it to a value greater than 0 unless you " - "are ABSOLUTELY SURE of what you are doing and WILL NOT HOLD GlusterFS " - "RESPONSIBLE for inconsistent data. 
If you are in doubt, set it to a value " - "greater than 0."; +void +afr_ta_init(afr_private_t *priv) +{ + priv->thin_arbiter_count = 1; + priv->child_count--; + priv->ta_child_up = 0; + priv->ta_bad_child_index = AFR_CHILD_UNKNOWN; + priv->ta_notify_dom_lock_offset = 0; + priv->ta_in_mem_txn_count = 0; + priv->ta_on_wire_txn_count = 0; + priv->release_ta_notify_dom_lock = _gf_false; + INIT_LIST_HEAD(&priv->ta_waitq); + INIT_LIST_HEAD(&priv->ta_onwireq); + gf_uuid_clear(priv->ta_gfid); +} int32_t -init (xlator_t *this) +init(xlator_t *this) { - afr_private_t * priv = NULL; - int child_count = 0; - xlator_list_t * trav = NULL; - int i = 0; - int ret = -1; - int op_errno = 0; - - char * read_subvol = NULL; - char * fav_child = NULL; - char * self_heal = NULL; - char * algo = NULL; - char * change_log = NULL; - char * strict_readdir = NULL; - char * inodelk_trace = NULL; - char * entrylk_trace = NULL; - - int32_t background_count = 0; - int32_t lock_server_count = 1; - int32_t window_size = 0; - - int fav_ret = -1; - int read_ret = -1; - int dict_ret = -1; - - - if (!this->children) { - gf_log (this->name, GF_LOG_ERROR, - "replicate translator needs more than one " - "subvolume defined."); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "Volume is dangling."); - } - - - ALLOC_OR_GOTO (this->private, afr_private_t, out); - - priv = this->private; - - read_ret = dict_get_str (this->options, "read-subvolume", &read_subvol); - priv->read_child = -1; - - fav_ret = dict_get_str (this->options, "favorite-child", &fav_child); - priv->favorite_child = -1; - - priv->background_self_heal_count = 16; - - dict_ret = dict_get_int32 (this->options, "background-self-heal-count", - &background_count); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Setting background self-heal count to %d", - background_count); - - priv->background_self_heal_count = background_count; - } - - /* Default values */ - - priv->data_self_heal = 1; - 
priv->metadata_self_heal = 1; - priv->entry_self_heal = 1; - - dict_ret = dict_get_str (this->options, "data-self-heal", &self_heal); - if (dict_ret == 0) { - ret = gf_string2boolean (self_heal, &priv->data_self_heal); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option data-self-heal %s'. " - "Defaulting to data-self-heal as 'on'", - self_heal); - priv->data_self_heal = 1; - } - } - - priv->data_self_heal_algorithm = ""; - - dict_ret = dict_get_str (this->options, "data-self-heal-algorithm", - &algo); - if (dict_ret == 0) { - priv->data_self_heal_algorithm = gf_strdup (algo); + afr_private_t *priv = NULL; + int child_count = 0; + xlator_list_t *trav = NULL; + int i = 0; + int ret = -1; + GF_UNUSED int op_errno = 0; + xlator_t *read_subvol = NULL; + int read_subvol_index = -1; + char *qtype = NULL; + char *fav_child_policy = NULL; + char *thin_arbiter = NULL; + char *data_self_heal = NULL; + char *locking_scheme = NULL; + char *data_self_heal_algorithm = NULL; + + if (!this->children) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_CHILD_MISCONFIGURED, + "replicate translator needs more than one " + "subvolume defined."); + return -1; + } + + if (!this->parents) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_VOL_MISCONFIGURED, + "Volume is dangling."); + } + + this->private = GF_CALLOC(1, sizeof(afr_private_t), + gf_afr_mt_afr_private_t); + if (!this->private) + goto out; + + priv = this->private; + INIT_LIST_HEAD(&priv->saved_locks); + INIT_LIST_HEAD(&priv->lk_healq); + LOCK_INIT(&priv->lock); + + child_count = xlator_subvolume_count(this); + + priv->child_count = child_count; + + priv->read_child = -1; + + GF_OPTION_INIT("arbiter-count", priv->arbiter_count, uint32, out); + GF_OPTION_INIT("thin-arbiter", thin_arbiter, str, out); + if (thin_arbiter && strlen(thin_arbiter) > 0) { + afr_ta_init(priv); + } + INIT_LIST_HEAD(&priv->healing); + INIT_LIST_HEAD(&priv->heal_waiting); + + priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT; + 
+ GF_OPTION_INIT("afr-dirty-xattr", priv->afr_dirty, str, out); + + GF_OPTION_INIT("metadata-splitbrain-forced-heal", + priv->metadata_splitbrain_forced_heal, bool, out); + + GF_OPTION_INIT("read-subvolume", read_subvol, xlator, out); + if (read_subvol) { + priv->read_child = xlator_subvolume_index(this, read_subvol); + if (priv->read_child == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL, + "%s not a subvolume", read_subvol->name); + goto out; + } + } + GF_OPTION_INIT("read-subvolume-index", read_subvol_index, int32, out); + if (read_subvol_index > -1) { + if (read_subvol_index >= priv->child_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL, + "%d not a subvolume-index", read_subvol_index); + goto out; } + priv->read_child = read_subvol_index; + } + GF_OPTION_INIT("choose-local", priv->choose_local, bool, out); + priv->pending_reads = GF_CALLOC(sizeof(*priv->pending_reads), + priv->child_count, gf_afr_mt_atomic_t); - priv->data_self_heal_window_size = 16; - - dict_ret = dict_get_int32 (this->options, "data-self-heal-window-size", - &window_size); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Setting data self-heal window size to %d", - window_size); - - priv->data_self_heal_window_size = window_size; - } - - dict_ret = dict_get_str (this->options, "metadata-self-heal", - &self_heal); - if (dict_ret == 0) { - ret = gf_string2boolean (self_heal, &priv->metadata_self_heal); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option metadata-self-heal %s'. " - "Defaulting to metadata-self-heal as 'on'.", - self_heal); - priv->metadata_self_heal = 1; - } - } - - dict_ret = dict_get_str (this->options, "entry-self-heal", &self_heal); - if (dict_ret == 0) { - ret = gf_string2boolean (self_heal, &priv->entry_self_heal); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option entry-self-heal %s'. 
" - "Defaulting to entry-self-heal as 'on'.", - self_heal); - priv->entry_self_heal = 1; - } - } - - /* Change log options */ - - priv->data_change_log = 1; - priv->metadata_change_log = 1; - priv->entry_change_log = 1; - priv->optimistic_change_log = 1; - - dict_ret = dict_get_str (this->options, "data-change-log", - &change_log); - if (dict_ret == 0) { - ret = gf_string2boolean (change_log, &priv->data_change_log); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option data-change-log %s'. " - "Defaulting to data-change-log as 'on'.", - change_log); - priv->data_change_log = 1; - } - } - - dict_ret = dict_get_str (this->options, "metadata-change-log", - &change_log); - if (dict_ret == 0) { - ret = gf_string2boolean (change_log, - &priv->metadata_change_log); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option metadata-change-log %s'. " - "Defaulting to metadata-change-log as 'off'.", - change_log); - priv->metadata_change_log = 0; - } - } - - dict_ret = dict_get_str (this->options, "entry-change-log", - &change_log); - if (dict_ret == 0) { - ret = gf_string2boolean (change_log, &priv->entry_change_log); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option entry-change-log %s'. " - "Defaulting to entry-change-log as 'on'.", - change_log); - priv->entry_change_log = 1; - } - } - - dict_ret = dict_get_str (this->options, "optimistic-change-log", - &change_log); - if (dict_ret == 0) { - ret = gf_string2boolean (change_log, &priv->optimistic_change_log); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option optimistic-change-log %s'. 
" - "Defaulting to optimistic-change-log as 'on'.", - change_log); - priv->optimistic_change_log = 1; - } - } - - /* Locking options */ - - priv->inodelk_trace = 0; - priv->entrylk_trace = 0; - - dict_ret = dict_get_str (this->options, "inodelk-trace", - &inodelk_trace); - if (dict_ret == 0) { - ret = gf_string2boolean (inodelk_trace, &priv->inodelk_trace); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option inodelk-trace %s' ", - inodelk_trace); - - priv->inodelk_trace = 0; - } - } - - - dict_ret = dict_get_str (this->options, "entrylk-trace", - &entrylk_trace); - if (dict_ret == 0) { - ret = gf_string2boolean (entrylk_trace, &priv->entrylk_trace); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option entrylk-trace %s' ", - inodelk_trace); - - priv->entrylk_trace = 0; - } - } - - - priv->data_lock_server_count = 1; - priv->metadata_lock_server_count = 0; - priv->entry_lock_server_count = 1; - - dict_ret = dict_get_int32 (this->options, "data-lock-server-count", - &lock_server_count); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Setting data lock server count to %d.", - lock_server_count); - - if (lock_server_count == 0) - gf_log (this->name, GF_LOG_WARNING, "%s", - no_lock_servers_warning_str); - - priv->data_lock_server_count = lock_server_count; - } - - - dict_ret = dict_get_int32 (this->options, - "metadata-lock-server-count", - &lock_server_count); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Setting metadata lock server count to %d.", - lock_server_count); - priv->metadata_lock_server_count = lock_server_count; - } - - - dict_ret = dict_get_int32 (this->options, "entry-lock-server-count", - &lock_server_count); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Setting entry lock server count to %d.", - lock_server_count); - - priv->entry_lock_server_count = lock_server_count; - } - - priv->strict_readdir = _gf_false; - - dict_ret = dict_get_str (this->options, 
"strict-readdir", - &strict_readdir); - if (dict_ret == 0) { - ret = gf_string2boolean (strict_readdir, &priv->strict_readdir); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option strict-readdir %s'. " - "Defaulting to strict-readdir as 'off'.", - strict_readdir); - } - } - - trav = this->children; - while (trav) { - if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) { - gf_log (this->name, GF_LOG_DEBUG, - "Subvolume '%s' specified as read child.", - trav->xlator->name); - - priv->read_child = child_count; - } - - if (fav_ret == 0 && !strcmp (fav_child, trav->xlator->name)) { - gf_log (this->name, GF_LOG_WARNING, - favorite_child_warning_str, trav->xlator->name, - trav->xlator->name, trav->xlator->name); - priv->favorite_child = child_count; - } - - child_count++; - trav = trav->next; - } - - priv->wait_count = 1; - - priv->child_count = child_count; - - LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->read_child_lock); - - priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, + GF_OPTION_INIT("read-hash-mode", priv->hash_mode, uint32, out); + + priv->favorite_child = -1; + + GF_OPTION_INIT("favorite-child-policy", fav_child_policy, str, out); + if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1) + goto out; + + GF_OPTION_INIT("shd-max-threads", priv->shd.max_threads, uint32, out); + + GF_OPTION_INIT("shd-wait-qlength", priv->shd.wait_qlength, uint32, out); + + GF_OPTION_INIT("background-self-heal-count", + priv->background_self_heal_count, uint32, out); + + GF_OPTION_INIT("heal-wait-queue-length", priv->heal_wait_qlen, uint32, out); + + GF_OPTION_INIT("data-self-heal", data_self_heal, str, out); + if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1) + goto out; + + GF_OPTION_INIT("data-self-heal-algorithm", data_self_heal_algorithm, str, + out); + set_data_self_heal_algorithm(priv, data_self_heal_algorithm); + + GF_OPTION_INIT("data-self-heal-window-size", + priv->data_self_heal_window_size, uint32, 
out); + + GF_OPTION_INIT("metadata-self-heal", priv->metadata_self_heal, bool, out); + + GF_OPTION_INIT("entry-self-heal", priv->entry_self_heal, bool, out); + + GF_OPTION_INIT("halo-shd-max-latency", priv->shd.halo_max_latency_msec, + uint32, out); + + GF_OPTION_INIT("halo-max-latency", priv->halo_max_latency_msec, uint32, + out); + GF_OPTION_INIT("halo-max-replicas", priv->halo_max_replicas, uint32, out); + GF_OPTION_INIT("halo-min-replicas", priv->halo_min_replicas, uint32, out); + + GF_OPTION_INIT("halo-enabled", priv->halo_enabled, bool, out); + + GF_OPTION_INIT("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec, + uint32, out); + + GF_OPTION_INIT("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out); + + GF_OPTION_INIT("optimistic-change-log", priv->optimistic_change_log, bool, + out); + + GF_OPTION_INIT("pre-op-compat", priv->pre_op_compat, bool, out); + GF_OPTION_INIT("locking-scheme", locking_scheme, str, out); + priv->granular_locks = (strcmp(locking_scheme, "granular") == 0); + GF_OPTION_INIT("full-lock", priv->full_lock, bool, out); + GF_OPTION_INIT("granular-entry-heal", priv->esh_granular, bool, out); + + GF_OPTION_INIT("eager-lock", priv->eager_lock, bool, out); + GF_OPTION_INIT("quorum-type", qtype, str, out); + GF_OPTION_INIT("quorum-count", priv->quorum_count, uint32, out); + GF_OPTION_INIT(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size_uint64, + out); + fix_quorum_options(this, priv, qtype, this->options); + + GF_OPTION_INIT("post-op-delay-secs", priv->post_op_delay_secs, uint32, out); + GF_OPTION_INIT("ensure-durability", priv->ensure_durability, bool, out); + + GF_OPTION_INIT("self-heal-daemon", priv->shd.enabled, bool, out); + + GF_OPTION_INIT("iam-self-heal-daemon", priv->shd.iamshd, bool, out); + GF_OPTION_INIT("heal-timeout", priv->shd.timeout, int32, out); + + GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out); + GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out); + 
afr_handle_anon_inode_options(priv, this->options); + + GF_OPTION_INIT("use-anonymous-inode", priv->use_anon_inode, bool, out); + if (priv->quorum_count != 0) + priv->consistent_io = _gf_false; + + priv->wait_count = 1; + + priv->local = GF_CALLOC(sizeof(unsigned char), child_count, gf_afr_mt_char); + if (!priv->local) { + ret = -ENOMEM; + goto out; + } + + priv->anon_inode = GF_CALLOC(sizeof(unsigned char), child_count, + gf_afr_mt_char); + + priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count, + gf_afr_mt_char); + + priv->child_latency = GF_MALLOC(sizeof(*priv->child_latency) * child_count, + gf_afr_mt_child_latency_t); + priv->halo_child_up = GF_CALLOC(sizeof(unsigned char), child_count, gf_afr_mt_char); - if (!priv->child_up) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - ret = -ENOMEM; - goto out; - } - - for (i = 0; i < child_count; i++) - priv->child_up[i] = -1; /* start with unknown state. - this initialization needed - for afr_notify() to work - reliably - */ - - priv->children = GF_CALLOC (sizeof (xlator_t *), child_count, - gf_afr_mt_xlator_t); - if (!priv->children) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - ret = -ENOMEM; - goto out; - } - - priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key), - child_count, - gf_afr_mt_char); - if (!priv->pending_key) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - ret = -ENOMEM; - goto out; + + if (!priv->child_up || !priv->child_latency || !priv->halo_child_up || + !priv->anon_inode) { + ret = -ENOMEM; + goto out; + } + /*Initialize to -ve ping timeout so that they are not considered + * in child-up events until ping-event comes*/ + for (i = 0; i < child_count; i++) + priv->child_latency[i] = -1; + + priv->children = GF_CALLOC(sizeof(xlator_t *), child_count, + gf_afr_mt_xlator_t); + if (!priv->children) { + ret = -ENOMEM; + goto out; + } + + ret = afr_pending_xattrs_init(priv, this); + if (ret) + goto out; + + trav = this->children; + i = 0; + while (i < 
child_count) { + priv->children[i] = trav->xlator; + trav = trav->next; + i++; + } + + ret = gf_asprintf(&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, this->name); + if (-1 == ret) { + ret = -ENOMEM; + goto out; + } + + priv->last_event = GF_CALLOC(child_count, sizeof(*priv->last_event), + gf_afr_mt_int32_t); + if (!priv->last_event) { + ret = -ENOMEM; + goto out; + } + + this->itable = inode_table_new(SHD_INODE_LRU_LIMIT, this); + if (!this->itable) { + ret = -ENOMEM; + goto out; + } + + if (priv->shd.iamshd) { + ret = afr_selfheal_daemon_init(this); + if (ret) { + ret = -ENOMEM; + goto out; } + } - trav = this->children; - i = 0; - while (i < child_count) { - priv->children[i] = trav->xlator; - - ret = gf_asprintf (&priv->pending_key[i], "%s.%s", - AFR_XATTR_PREFIX, - trav->xlator->name); - if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed to set pending key"); - ret = -ENOMEM; - goto out; - } - - trav = trav->next; - i++; - } - - LOCK_INIT (&priv->root_inode_lk); - priv->first_lookup = 1; - priv->root_inode = NULL; - - pthread_mutex_init (&priv->mutex, NULL); - INIT_LIST_HEAD (&priv->saved_fds); - - ret = 0; + /* keep more local here as we may need them for self-heal etc */ + this->local_pool = mem_pool_new(afr_local_t, 512); + if (!this->local_pool) { + ret = -1; + goto out; + } + + priv->root_inode = NULL; + + ret = 0; out: - return ret; + return ret; +} +void +afr_destroy_healer_object(xlator_t *this, struct subvol_healer *healer) +{ + int ret = -1; + + if (!healer) + return; + + if (healer->running) { + /* + * If there are any resources to cleanup, We need + * to do that gracefully using pthread_cleanup_push + */ + ret = gf_thread_cleanup_xint(healer->thread); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SELF_HEAL_FAILED, + "Failed to clean up healer threads."); + healer->thread = 0; + } + pthread_cond_destroy(&healer->cond); + pthread_mutex_destroy(&healer->mutex); } - -int -fini (xlator_t *this) +void 
+afr_selfheal_daemon_fini(xlator_t *this) { - return 0; + struct subvol_healer *healer = NULL; + afr_self_heald_t *shd = NULL; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + if (!priv) + return; + + shd = &priv->shd; + if (!shd->iamshd) + return; + + for (i = 0; i < priv->child_count; i++) { + healer = &shd->index_healers[i]; + afr_destroy_healer_object(this, healer); + + healer = &shd->full_healers[i]; + afr_destroy_healer_object(this, healer); + + if (shd->statistics[i]) + eh_destroy(shd->statistics[i]); + } + GF_FREE(shd->index_healers); + GF_FREE(shd->full_healers); + GF_FREE(shd->statistics); + if (shd->split_brain) + eh_destroy(shd->split_brain); } +void +fini(xlator_t *this) +{ + afr_private_t *priv = NULL; + + priv = this->private; + + afr_selfheal_daemon_fini(this); + GF_ASSERT(list_empty(&priv->saved_locks)); + + LOCK(&priv->lock); + if (priv->timer != NULL) { + gf_timer_call_cancel(this->ctx, priv->timer); + priv->timer = NULL; + } + UNLOCK(&priv->lock); + + if (this->local_pool != NULL) { + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + } + + this->private = NULL; + afr_priv_destroy(priv); + if (this->itable) { + inode_table_destroy(this->itable); + this->itable = NULL; + } + return; +} struct xlator_fops fops = { - .lookup = afr_lookup, - .open = afr_open, - .lk = afr_lk, - .flush = afr_flush, - .statfs = afr_statfs, - .fsync = afr_fsync, - .fsyncdir = afr_fsyncdir, - .xattrop = afr_xattrop, - .fxattrop = afr_fxattrop, - .inodelk = afr_inodelk, - .finodelk = afr_finodelk, - .entrylk = afr_entrylk, - .fentrylk = afr_fentrylk, - - /* inode read */ - .access = afr_access, - .stat = afr_stat, - .fstat = afr_fstat, - .readlink = afr_readlink, - .getxattr = afr_getxattr, - .readv = afr_readv, - - /* inode write */ - .writev = afr_writev, - .truncate = afr_truncate, - .ftruncate = afr_ftruncate, - .setxattr = afr_setxattr, - .setattr = afr_setattr, - .fsetattr = afr_fsetattr, - .removexattr = afr_removexattr, - - /* 
dir read */ - .opendir = afr_opendir, - .readdir = afr_readdir, - .readdirp = afr_readdirp, - - /* dir write */ - .create = afr_create, - .mknod = afr_mknod, - .mkdir = afr_mkdir, - .unlink = afr_unlink, - .rmdir = afr_rmdir, - .link = afr_link, - .symlink = afr_symlink, - .rename = afr_rename, + .lookup = afr_lookup, + .lk = afr_lk, + .flush = afr_flush, + .statfs = afr_statfs, + .fsyncdir = afr_fsyncdir, + .inodelk = afr_inodelk, + .finodelk = afr_finodelk, + .entrylk = afr_entrylk, + .fentrylk = afr_fentrylk, + .ipc = afr_ipc, + .lease = afr_lease, + + /* inode read */ + .access = afr_access, + .stat = afr_stat, + .fstat = afr_fstat, + .readlink = afr_readlink, + .getxattr = afr_getxattr, + .fgetxattr = afr_fgetxattr, + .readv = afr_readv, + .seek = afr_seek, + + /* inode write */ + .writev = afr_writev, + .truncate = afr_truncate, + .ftruncate = afr_ftruncate, + .setxattr = afr_setxattr, + .fsetxattr = afr_fsetxattr, + .setattr = afr_setattr, + .fsetattr = afr_fsetattr, + .removexattr = afr_removexattr, + .fremovexattr = afr_fremovexattr, + .fallocate = afr_fallocate, + .discard = afr_discard, + .zerofill = afr_zerofill, + .xattrop = afr_xattrop, + .fxattrop = afr_fxattrop, + .fsync = afr_fsync, + + /*inode open*/ + .opendir = afr_opendir, + .open = afr_open, + + /* dir read */ + .readdir = afr_readdir, + .readdirp = afr_readdirp, + + /* dir write */ + .create = afr_create, + .mknod = afr_mknod, + .mkdir = afr_mkdir, + .unlink = afr_unlink, + .rmdir = afr_rmdir, + .link = afr_link, + .symlink = afr_symlink, + .rename = afr_rename, }; - struct xlator_dumpops dumpops = { - .priv = afr_priv_dump, + .priv = afr_priv_dump, }; - struct xlator_cbks cbks = { - .release = afr_release, - .releasedir = afr_releasedir, + .release = afr_release, + .releasedir = afr_releasedir, + .forget = afr_forget, }; - struct volume_options options[] = { - { .key = {"read-subvolume" }, - .type = GF_OPTION_TYPE_XLATOR - }, - { .key = {"favorite-child"}, - .type = GF_OPTION_TYPE_XLATOR - 
}, - { .key = {"background-self-heal-count"}, - .type = GF_OPTION_TYPE_INT, - .min = 0 - }, - { .key = {"data-self-heal"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"data-self-heal-algorithm"}, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"data-self-heal-window-size"}, - .type = GF_OPTION_TYPE_INT, - .min = 1, - .max = 1024 - }, - { .key = {"metadata-self-heal"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"entry-self-heal"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"data-change-log"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"metadata-change-log"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"entry-change-log"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"optimistic-change-log"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"data-lock-server-count"}, - .type = GF_OPTION_TYPE_INT, - .min = 0 - }, - { .key = {"metadata-lock-server-count"}, - .type = GF_OPTION_TYPE_INT, - .min = 0 - }, - { .key = {"entry-lock-server-count"}, - .type = GF_OPTION_TYPE_INT, - .min = 0 - }, - { .key = {"strict-readdir"}, - .type = GF_OPTION_TYPE_BOOL, - }, - { .key = {NULL} }, + {.key = {"read-subvolume"}, + .type = GF_OPTION_TYPE_XLATOR, + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "inode-read fops happen only on one of the bricks in " + "replicate. Afr will prefer the one specified using " + "this option if it is not stale. Option value must be " + "one of the xlator names of the children. " + "Ex: <volname>-client-0 till " + "<volname>-client-<number-of-bricks - 1>"}, + {.key = {"read-subvolume-index"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "-1", + .op_version = {2}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "inode-read fops happen only on one of the bricks in " + "replicate. AFR will prefer the one specified using " + "this option if it is not stale. 
allowed options" + " include -1 till replica-count - 1"}, + {.key = {"read-hash-mode"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 5, + .default_value = "1", + .op_version = {2}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = + "inode-read fops happen only on one of the bricks in " + "replicate. AFR will prefer the one computed using " + "the method specified using this option.\n" + "0 = first readable child of AFR, starting from 1st child.\n" + "1 = hash by GFID of file (all clients use " + "same subvolume).\n" + "2 = hash by GFID of file and client PID.\n" + "3 = brick having the least outstanding read requests.\n" + "4 = brick having the least network ping latency.\n" + "5 = Hybrid mode between 3 and 4, ie least value among " + "network-latency multiplied by outstanding-read-requests."}, + { + .key = {"choose-local"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .op_version = {2}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "Choose a local subvolume (i.e. 
Brick) to read from" + " if read-subvolume is not explicitly set.", + }, + {.key = {"background-self-heal-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 256, + .default_value = "8", + .validate = GF_OPT_VALIDATE_MIN, + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This specifies the number of per client self-heal " + "jobs that can perform parallel heals in the " + "background."}, + {.key = {"halo-shd-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "99999", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate", "halo"}, + .description = "Maximum latency for shd halo replication in msec."}, + {.key = {"halo-enabled"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "False", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate", "halo"}, + .description = "Enable Halo (geo) replication mode."}, + {.key = {"halo-nfsd-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "5", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate", "halo"}, + .description = "Maximum latency for nfsd halo replication in msec."}, + {.key = {"halo-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = AFR_HALO_MAX_LATENCY, + .default_value = "5", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate", "halo"}, + .description = "Maximum latency for halo replication in msec."}, + {.key = {"halo-max-replicas"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "99999", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = 
{"replicate", "halo"}, + .description = "The maximum number of halo replicas; replicas" + " beyond this value will be written asynchronously" + "via the SHD."}, + {.key = {"halo-min-replicas"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "2", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate", "halo"}, + .description = "The minimmum number of halo replicas, before adding " + "out of region replicas."}, + {.key = {"heal-wait-queue-length"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 10000, /*Around 100MB with sizeof(afr_local_t)= 10496 bytes*/ + .default_value = "128", + .validate = GF_OPT_VALIDATE_MIN, + .op_version = {GD_OP_VERSION_3_7_10}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This specifies the number of heals that can be queued" + " for the parallel background self heal jobs."}, + {.key = {"data-self-heal"}, + .type = GF_OPTION_TYPE_STR, + .value = {"1", "on", "yes", "true", "enable", "0", "off", "no", "false", + "disable", "open"}, + .default_value = "off", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "Using this option we can enable/disable data " + "self-heal on the file. \"open\" means data " + "self-heal action will only be triggered by file " + "open operations."}, + {.key = {"data-self-heal-algorithm"}, + .type = GF_OPTION_TYPE_STR, + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "Select between \"full\", \"diff\". The " + "\"full\" algorithm copies the entire file from " + "source to sink. The \"diff\" algorithm copies to " + "sink only those blocks whose checksums don't match " + "with those of source. 
If no option is configured " + "the option is chosen dynamically as follows: " + "If the file does not exist on one of the sinks " + "or empty file exists or if the source file size is " + "about the same as page size the entire file will " + "be read and written i.e \"full\" algo, " + "otherwise \"diff\" algo is chosen.", + .value = {"diff", "full"}}, + {.key = {"data-self-heal-window-size"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 1024, + .default_value = "1", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "Maximum number blocks per file for which self-heal " + "process would be applied simultaneously."}, + {.key = {"metadata-self-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + /*.validate_fn = validate_replica*/ + .description = "Using this option we can enable/disable metadata " + "i.e. 
Permissions, ownerships, xattrs self-heal on " + "the file/directory."}, + {.key = {"entry-self-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + /*.validate_fn = validate_replica*/ + .description = "Using this option we can enable/disable entry " + "self-heal on the directory."}, + {.key = {"data-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This option exists only for backward compatibility " + "and configuring it doesn't have any effect"}, + {.key = {"metadata-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This option exists only for backward compatibility " + "and configuring it doesn't have any effect"}, + {.key = {"entry-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This option exists only for backward compatibility " + "and configuring it doesn't have any effect"}, + {.key = {"optimistic-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Entry/Metadata fops will not perform " + "pre fop changelog operations in afr transaction " + "if this option is enabled."}, + {.key = {"inodelk-trace"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Enabling this option logs inode lock/unlocks"}, + {.key = {"entrylk-trace"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Enabling this option logs entry lock/unlocks"}, + {.key = {"pre-op-compat"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = 
"on", + .description = "Use separate pre-op xattrop() FOP rather than " + "overloading xdata of the OP"}, + {.key = {"eager-lock"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = + "Enable/Disable eager lock for replica volume. " + "Lock phase of a transaction has two sub-phases. " + "First is an attempt to acquire locks in parallel by " + "broadcasting non-blocking lock requests. If lock " + "acquisition fails on any server, then the held locks " + "are unlocked and we revert to a blocking locks mode " + "sequentially on one server after another. If this " + "option is enabled the initial broadcasting lock " + "request attempts to acquire a full lock on the entire file. " + "If this fails, we revert back to the sequential " + "\"regional\" blocking locks as before. In the case " + "where such an \"eager\" lock is granted in the " + "non-blocking phase, it gives rise to an opportunity " + "for optimization. i.e, if the next write transaction " + "on the same FD arrives before the unlock phase of " + "the first transaction, it \"takes over\" the full " + "file lock. Similarly if yet another data transaction " + "arrives before the unlock phase of the \"optimized\" " + "transaction, that in turn \"takes over\" the lock as " + "well. The actual unlock now happens at the end of " + "the last \"optimized\" transaction." + + }, + {.key = {"self-heal-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE, + .tags = {"replicate"}, + /*.validate_fn = validate_replica_heal_enable_disable*/ + .description = "This option applies to only self-heal-daemon. 
" + "Index directory crawl and automatic healing of files " + "will not be performed if this option is turned off."}, + {.key = {"iam-self-heal-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option differentiates if the replicate " + "translator is running as part of self-heal-daemon " + "or not."}, + {.key = {"iam-nfs-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option differentiates if the replicate " + "translator is running as part of an NFS daemon " + "or not."}, + { + .key = {"quorum-type"}, + .type = GF_OPTION_TYPE_STR, + .value = {"none", "auto", "fixed"}, + .default_value = "none", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + /*.option = quorum-type*/ + .description = "If value is \"fixed\" only allow writes if " + "quorum-count bricks are present. If value is " + "\"auto\" only allow writes if more than half of " + "bricks, or exactly half including the first, are " + "present.", + }, + { + .key = {"quorum-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = INT_MAX, + .default_value = 0, + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + /*.option = quorum-count*/ + /*.validate_fn = validate_quorum_count*/ + .description = "If quorum-type is \"fixed\" only allow writes if " + "this many bricks are present. Other quorum types " + "will OVERWRITE this value.", + }, + { + .key = {"quorum-reads"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This option has been removed. 
Reads are not allowed " + "if quorum is not met.", + }, + { + .key = {"node-uuid"}, + .type = GF_OPTION_TYPE_STR, + .description = "Local glusterd uuid string, used in starting " + "self-heal-daemon so that it can crawl only on " + "local index directories.", + }, + { + .key = {"post-op-delay-secs"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = INT_MAX, + .default_value = "1", + .op_version = {2}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "Time interval induced artificially before " + "post-operation phase of the transaction to " + "enhance overlap of adjacent write operations.", + }, + { + .key = {AFR_SH_READDIR_SIZE_KEY}, + .type = GF_OPTION_TYPE_SIZET, + .description = "readdirp size for performing entry self-heal", + .min = 1024, + .max = 131072, + .op_version = {2}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE, + .tags = {"replicate"}, + .default_value = "1KB", + }, + { + .key = {"ensure-durability"}, + .type = GF_OPTION_TYPE_BOOL, + .op_version = {3}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "Afr performs fsyncs for transactions if this " + "option is on to make sure the changelogs/data is " + "written to the disk", + .default_value = "on", + }, + { + .key = {"afr-dirty-xattr"}, + .type = GF_OPTION_TYPE_STR, + .default_value = AFR_DIRTY_DEFAULT, + }, + {.key = {"afr-pending-xattr"}, + .type = GF_OPTION_TYPE_STR, + .description = "Comma separated list of xattrs that are used to " + "capture information on pending heals."}, + { + .key = {"metadata-splitbrain-forced-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, + {.key = {"heal-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 5, + .max = INT_MAX, + .default_value = "600", + .op_version = {2}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "time interval for checking the need to 
self-heal " + "in self-heal-daemon"}, + { + .key = {"consistent-metadata"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "If this option is enabled, readdirp will force " + "lookups on those entries read whose read child is " + "not the same as that of the parent. This will " + "guarantee that all read operations on a file serve " + "attributes from the same subvol as long as it holds " + " a good copy of the file/dir.", + }, + {.key = {"arbiter-count"}, + .type = GF_OPTION_TYPE_INT, + .description = "subset of child_count. Has to be 0 or 1."}, + { + .key = {"thin-arbiter"}, + .type = GF_OPTION_TYPE_STR, + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE, + .tags = {"replicate"}, + .description = "contains host:path of thin abriter brick", + }, + {.key = {"shd-max-threads"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 64, + .default_value = "1", + .op_version = {GD_OP_VERSION_3_7_12}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "Maximum number of parallel heals SHD can do per " + "local brick. 
This can substantially lower heal times" + ", but can also crush your bricks if you don't have " + "the storage hardware to support this."}, + { + .key = {"shd-wait-qlength"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 655536, + .default_value = "1024", + .op_version = {GD_OP_VERSION_3_7_12}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This option can be used to control number of heals" + " that can wait in SHD per subvolume", + }, + { + .key = {"locking-scheme"}, + .type = GF_OPTION_TYPE_STR, + .value = {"full", "granular"}, + .default_value = "full", + .op_version = {GD_OP_VERSION_3_7_12}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "If this option is set to granular, self-heal will " + "stop being compatible with afr-v1, which helps afr " + "be more granular while self-healing", + }, + {.key = {"full-lock"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "yes", + .op_version = {GD_OP_VERSION_3_13_2}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE, + .tags = {"replicate"}, + .description = "If this option is disabled, then the IOs will take " + "range locks same as versions till 3.13.1."}, + { + .key = {"granular-entry-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .op_version = {GD_OP_VERSION_3_8_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "If this option is enabled, self-heal will resort to " + "granular way of recording changelogs and doing entry " + "self-heal.", + }, + { + .key = {"favorite-child-policy"}, + .type = GF_OPTION_TYPE_STR, + .value = {"none", "size", "ctime", "mtime", "majority"}, + .default_value = "none", + .op_version = {GD_OP_VERSION_3_7_12}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This option can be used to automatically resolve " + 
"split-brains using various policies without user " + "intervention. \"size\" picks the file with the " + "biggest size as the source. \"ctime\" and \"mtime\" " + "pick the file with the latest ctime and mtime " + "respectively as the source. \"majority\" picks a file" + " with identical mtime and size in more than half the " + "number of bricks in the replica.", + }, + { + .key = {"consistent-io"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .description = "If this option is enabled, i/o will fail even if " + "one of the bricks is down in the replicas", + }, + {.key = {"use-compound-fops"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .op_version = {GD_OP_VERSION_3_8_4}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"replicate"}, + .description = "This option exists only for backward compatibility " + "and configuring it doesn't have any effect"}, + {.key = {"use-anonymous-inode"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .op_version = {GD_OP_VERSION_8_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE, + .tags = {"replicate"}, + .description = "Setting this option heals directory renames efficiently"}, + + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "replicate", + .category = GF_MAINTAINED, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index e6cd9bf4905..d62f9a9caf2 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -1,943 +1,1423 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ #ifndef __AFR_H__ #define __AFR_H__ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "call-stub.h" -#include "compat-errno.h" +#include <glusterfs/call-stub.h> +#include <glusterfs/compat-errno.h> #include "afr-mem-types.h" #include "libxlator.h" +#include <glusterfs/timer.h> +#include <glusterfs/syncop.h> + +#include "afr-self-heald.h" +#include "afr-messages.h" + +#define SHD_INODE_LRU_LIMIT 1 +#define AFR_PATHINFO_HEADER "REPLICATE:" +#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size" +#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal" +#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty" +#define AFR_DIRTY (((afr_private_t *)(THIS->private))->afr_dirty) + +#define AFR_LOCKEE_COUNT_MAX 3 +#define AFR_DOM_COUNT_MAX 3 +#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ +#define AFR_DEFAULT_SPB_CHOICE_TIMEOUT 300 /*in seconds*/ + +#define ARBITER_BRICK_INDEX 2 +#define THIN_ARBITER_BRICK_INDEX 2 +#define AFR_TA_DOM_NOTIFY "afr.ta.dom-notify" +#define AFR_TA_DOM_MODIFY "afr.ta.dom-modify" + +#define AFR_LK_HEAL_DOM "afr.lock-heal.domain" + +#define AFR_HALO_MAX_LATENCY 99999 +#define AFR_ANON_DIR_PREFIX ".glusterfs-anonymous-inode" + +#define PFLAG_PENDING (1 << 0) +#define PFLAG_SBRAIN (1 << 1) + +typedef int (*afr_lock_cbk_t)(call_frame_t *frame, xlator_t *this); + +typedef int (*afr_read_txn_wind_t)(call_frame_t *frame, xlator_t *this, + int subvol); + +typedef int (*afr_inode_refresh_cbk_t)(call_frame_t *frame, xlator_t *this, + int err); + +typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this); + +#define AFR_COUNT(array, max) \ + ({ \ + int __i; \ + int __res = 0; \ + for (__i = 0; __i < max; __i++) \ + if (array[__i]) \ + __res++; \ + __res; \ + }) +#define AFR_INTERSECT(dst, src1, src2, max) \ + ({ \ + int __i; \ + for (__i = 0; __i < max; __i++) \ + dst[__i] = src1[__i] && src2[__i]; \ + }) +#define AFR_CMP(a1, a2, len) \ + ({ \ + int __cmp = 0; \ + int __i; \ + for (__i = 0; __i < len; __i++) \ 
+ if (a1[__i] != a2[__i]) { \ + __cmp = 1; \ + break; \ + } \ + __cmp; \ + }) +#define AFR_IS_ARBITER_BRICK(priv, index) \ + ((priv->arbiter_count == 1) && (index == ARBITER_BRICK_INDEX)) + +#define AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(ret, errnum) \ + do { \ + local->op_ret = ret; \ + local->op_errno = errnum; \ + if (local->op_errno == EIO) \ + gf_msg(this->name, GF_LOG_ERROR, local->op_errno, \ + AFR_MSG_SPLIT_BRAIN, \ + "Failing %s on gfid %s: " \ + "split-brain observed.", \ + gf_fop_list[local->op], uuid_utoa(local->inode->gfid)); \ + } while (0) + +#define AFR_ERROR_OUT_IF_FDCTX_INVALID(__fd, __this, __error, __label) \ + do { \ + afr_fd_ctx_t *__fd_ctx = NULL; \ + __fd_ctx = afr_fd_ctx_get(__fd, __this); \ + if (__fd_ctx && __fd_ctx->is_fd_bad) { \ + __error = EBADF; \ + goto __label; \ + } \ + } while (0) -#define AFR_XATTR_PREFIX "trusted.afr" +typedef enum { + AFR_READ_POLICY_FIRST_UP, + AFR_READ_POLICY_GFID_HASH, + AFR_READ_POLICY_GFID_PID_HASH, + AFR_READ_POLICY_LESS_LOAD, + AFR_READ_POLICY_LEAST_LATENCY, + AFR_READ_POLICY_LOAD_LATENCY_HYBRID, +} afr_read_hash_mode_t; -struct _pump_private; +typedef enum { + AFR_FAV_CHILD_NONE, + AFR_FAV_CHILD_BY_SIZE, + AFR_FAV_CHILD_BY_CTIME, + AFR_FAV_CHILD_BY_MTIME, + AFR_FAV_CHILD_BY_MAJORITY, + AFR_FAV_CHILD_POLICY_MAX, +} afr_favorite_child_policy; -typedef struct _afr_private { - gf_lock_t lock; /* to guard access to child_count, etc */ - unsigned int child_count; /* total number of children */ +typedef enum { + AFR_SELFHEAL_DATA_FULL = 0, + AFR_SELFHEAL_DATA_DIFF, + AFR_SELFHEAL_DATA_DYNAMIC, +} afr_data_self_heal_type_t; - unsigned int read_child_rr; /* round-robin index of the read_child */ - gf_lock_t read_child_lock; /* lock to protect above */ +typedef enum { + AFR_CHILD_UNKNOWN = -1, + AFR_CHILD_ZERO, + AFR_CHILD_ONE, + AFR_CHILD_THIN_ARBITER, +} afr_child_index; - xlator_t **children; +typedef enum { + TA_WAIT_FOR_NOTIFY_LOCK_REL, /*FOP came after notify domain lock upcall + notification and waiting for 
its release.*/ + TA_GET_INFO_FROM_TA_FILE, /*FOP needs post-op on ta file to get + *info about which brick is bad.*/ + TA_INFO_IN_MEMORY_SUCCESS, /*Bad brick info is in memory and fop failed + *on BAD brick - Success*/ + TA_INFO_IN_MEMORY_FAILED, /*Bad brick info is in memory and fop failed + *on GOOD brick - Failed*/ + TA_SUCCESS, /*FOP succeeded on both data bricks.*/ +} afr_ta_fop_state_t; + +struct afr_nfsd { + uint32_t halo_max_latency_msec; + gf_boolean_t iamnfsd; +}; + +typedef struct _afr_lk_heal_info { + fd_t *fd; + int32_t cmd; + struct gf_flock flock; + dict_t *xdata_req; + unsigned char *locked_nodes; + struct list_head pos; + gf_lkowner_t lk_owner; + pid_t pid; + int32_t *child_up_event_gen; + int32_t *child_down_event_gen; +} afr_lk_heal_info_t; - gf_lock_t root_inode_lk; - int first_lookup; - inode_t *root_inode; +typedef struct _afr_private { + gf_lock_t lock; /* to guard access to child_count, etc */ + unsigned int child_count; /* total number of children */ + unsigned int arbiter_count; /*subset of child_count. + Has to be 0 or 1.*/ + + xlator_t **children; + + inode_t *root_inode; + + int favorite_child; /* subvolume to be preferred in resolving + split-brain cases */ + /* For thin-arbiter. */ + uuid_t ta_gfid; + unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/ + int ta_bad_child_index; + int ta_event_gen; + unsigned int ta_in_mem_txn_count; + unsigned int ta_on_wire_txn_count; + struct list_head ta_waitq; + struct list_head ta_onwireq; + + unsigned char *anon_inode; + unsigned char *child_up; + unsigned char *halo_child_up; + int64_t *child_latency; + unsigned char *local; + + char **pending_key; + + afr_data_self_heal_type_t data_self_heal_algorithm; + unsigned int data_self_heal_window_size; /* max number of pipelined + read/writes */ + + struct list_head heal_waiting; /*queue for files that need heal*/ + uint32_t heal_wait_qlen; /*configurable queue length for heal_waiting*/ + int32_t heal_waiters; /* No. 
of elements currently in wait queue.*/ + + struct list_head healing; /* queue for files that are undergoing + background heal*/ + uint32_t background_self_heal_count; /*configurable queue length for + healing queue*/ + int32_t healers; /* No. of elements currently undergoing background + heal*/ + + gf_boolean_t release_ta_notify_dom_lock; + + gf_boolean_t metadata_self_heal; /* on/off */ + gf_boolean_t entry_self_heal; /* on/off */ + + gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ + int read_child; /* read-subvolume */ + gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/ + + gf_timer_t *timer; /* launched when parent up is received */ + + unsigned int wait_count; /* # of servers to wait for success */ + + unsigned char ta_child_up; + gf_boolean_t optimistic_change_log; + gf_boolean_t eager_lock; + gf_boolean_t pre_op_compat; /* on/off */ + uint32_t post_op_delay_secs; + unsigned int quorum_count; + + off_t ta_notify_dom_lock_offset; + afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic + resolution of split-brains.*/ + afr_read_hash_mode_t hash_mode; /* for when read_child is not set */ + + int32_t *last_event; + + /* @event_generation: Keeps count of number of events received which can + potentially impact consistency decisions. The events are CHILD_UP + and CHILD_DOWN, when we have to recalculate the freshness/staleness + of copies to detect if changes had happened while the other server + was down. CHILD_DOWN and CHILD_UP can also be received on network + disconnect/reconnects and not necessarily server going down/up. + Recalculating freshness/staleness on network events is equally + important as we might have had a network split brain. 
+ */ + uint32_t event_generation; + char vol_uuid[UUID_SIZE + 1]; + + gf_boolean_t choose_local; + gf_boolean_t did_discovery; + gf_boolean_t ensure_durability; + gf_boolean_t halo_enabled; + gf_boolean_t consistent_metadata; + gf_boolean_t need_heal; + gf_boolean_t granular_locks; + uint64_t sh_readdir_size; + char *sh_domain; + char *afr_dirty; + + uint64_t spb_choice_timeout; + + afr_self_heald_t shd; + struct afr_nfsd nfsd; + + uint32_t halo_max_latency_msec; + uint32_t halo_max_replicas; + uint32_t halo_min_replicas; + + gf_boolean_t full_lock; + gf_boolean_t esh_granular; + gf_boolean_t consistent_io; + gf_boolean_t data_self_heal; /* on/off */ + gf_boolean_t use_anon_inode; + + /*For lock healing.*/ + struct list_head saved_locks; + struct list_head lk_healq; + + /*For anon-inode handling */ + char anon_inode_name[NAME_MAX + 1]; + char anon_gfid_str[UUID_SIZE + 1]; +} afr_private_t; - unsigned char *child_up; +typedef enum { + AFR_DATA_TRANSACTION, /* truncate, write, ... */ + AFR_METADATA_TRANSACTION, /* chmod, chown, ... */ + AFR_ENTRY_TRANSACTION, /* create, rmdir, ... 
*/ + AFR_ENTRY_RENAME_TRANSACTION, /* rename */ +} afr_transaction_type; - char **pending_key; +/* + xattr format: trusted.afr.volume = [x y z] + x - data pending + y - metadata pending + z - entry pending +*/ - gf_boolean_t data_self_heal; /* on/off */ - char * data_self_heal_algorithm; /* name of algorithm */ - unsigned int data_self_heal_window_size; /* max number of pipelined - read/writes */ +static inline int +afr_index_for_transaction_type(afr_transaction_type type) +{ + switch (type) { + case AFR_DATA_TRANSACTION: + return 0; - unsigned int background_self_heal_count; - unsigned int background_self_heals_started; - gf_boolean_t metadata_self_heal; /* on/off */ - gf_boolean_t entry_self_heal; /* on/off */ + case AFR_METADATA_TRANSACTION: + return 1; - gf_boolean_t data_change_log; /* on/off */ - gf_boolean_t metadata_change_log; /* on/off */ - gf_boolean_t entry_change_log; /* on/off */ + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + return 2; + } - int read_child; /* read-subvolume */ - unsigned int favorite_child; /* subvolume to be preferred in resolving - split-brain cases */ + return -1; /* make gcc happy */ +} - unsigned int data_lock_server_count; - unsigned int metadata_lock_server_count; - unsigned int entry_lock_server_count; +static inline int +afr_index_from_ia_type(ia_type_t type) +{ + switch (type) { + case IA_IFDIR: + return afr_index_for_transaction_type(AFR_ENTRY_TRANSACTION); + case IA_IFREG: + return afr_index_for_transaction_type(AFR_DATA_TRANSACTION); + default: + return -1; + } +} - gf_boolean_t inodelk_trace; - gf_boolean_t entrylk_trace; +typedef struct { + struct gf_flock flock; + loc_t loc; + fd_t *fd; + char *basename; + unsigned char *locked_nodes; + int locked_count; - gf_boolean_t strict_readdir; +} afr_lockee_t; - unsigned int wait_count; /* # of servers to wait for success */ +int +afr_entry_lockee_cmp(const void *l1, const void *l2); - uint64_t up_count; /* number of CHILD_UPs we have seen */ - uint64_t 
down_count; /* number of CHILD_DOWNs we have seen */ +typedef struct { + loc_t *lk_loc; - struct _pump_private *pump_private; /* Set if we are loaded as pump */ - int use_afr_in_pump; + afr_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; - pthread_mutex_t mutex; - struct list_head saved_fds; /* list of fds on which locks have succeeded */ - gf_boolean_t optimistic_change_log; + const char *lk_basename; + const char *lower_basename; + const char *higher_basename; - char vol_uuid[UUID_SIZE + 1]; -} afr_private_t; + unsigned char *lower_locked_nodes; -typedef struct { - /* External interface: These are variables (some optional) that - are set by whoever has triggered self-heal */ + afr_lock_cbk_t lock_cbk; - gf_boolean_t need_data_self_heal; - gf_boolean_t need_metadata_self_heal; - gf_boolean_t need_entry_self_heal; + int lockee_count; - gf_boolean_t forced_merge; /* Is this a self-heal triggered to - forcibly merge the directories? */ + int32_t lk_call_count; + int32_t lk_expected_count; + int32_t lk_attempted_count; - gf_boolean_t healing_fd_opened; /* true if caller has already - opened fd */ + int32_t lock_op_ret; + int32_t lock_op_errno; + char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ + int32_t lock_count; + char lower_locked; + char higher_locked; +} afr_internal_lock_t; - gf_boolean_t data_lock_held; /* true if caller has already - acquired 0-0 lock */ +struct afr_reply { + int valid; + int32_t op_ret; + dict_t *xattr; /*For xattrop*/ + dict_t *xdata; + struct iatt poststat; + struct iatt postparent; + struct iatt prestat; + struct iatt preparent; + struct iatt preparent2; + struct iatt postparent2; + int32_t op_errno; + /* For rchecksum */ + uint8_t checksum[SHA256_DIGEST_LENGTH]; + gf_boolean_t buf_has_zeroes; + gf_boolean_t fips_mode_rchecksum; + /* For lookup */ + int8_t need_heal; +}; - fd_t *healing_fd; /* set if callers has opened fd */ +typedef enum { + AFR_FD_NOT_OPENED, + AFR_FD_OPENED, + AFR_FD_OPENING +} afr_fd_open_status_t; - 
gf_boolean_t background; /* do self-heal in background - if possible */ +typedef struct { + afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ + int flags; + + /* the subvolume on which the latest sequence of readdirs (starting + at offset 0) has begun. Till the next readdir request with 0 offset + arrives, we continue to read off this subvol. + */ + int readdir_subvol; + /* lock-healing related members. */ + gf_boolean_t is_fd_bad; + afr_lk_heal_info_t *lk_heal_info; - ia_type_t type; /* st_mode of the entry we're doing - self-heal on */ +} afr_fd_ctx_t; - /* Function to call to unwind. If self-heal is being done in the - background, this function will be called as soon as possible. */ +typedef enum { + AFR_FOP_LOCK_PARALLEL, + AFR_FOP_LOCK_SERIAL, + AFR_FOP_LOCK_QUORUM_FAILED, +} afr_fop_lock_state_t; + +typedef struct _afr_inode_lock_t { + /* @num_inodelks: + Number of inodelks queried from the server, as queried through + xdata in FOPs. Currently, used to decide if eager-locking must be + temporarily disabled. + */ + int32_t num_inodelks; + unsigned int event_generation; + gf_timer_t *delay_timer; + struct list_head owners; /*Transactions that are performing fop*/ + struct list_head post_op; /*Transactions that are done with the fop + *So can not conflict with the fops*/ + struct list_head waiting; /*Transaction that are waiting for + *conflicting transactions to complete*/ + struct list_head frozen; /*Transactions that need to go as part of + * next batch of eager-lock*/ + gf_boolean_t release; + gf_boolean_t acquired; +} afr_lock_t; + +typedef struct _afr_inode_ctx { + uint64_t read_subvol; + uint64_t write_subvol; + int lock_count; + int spb_choice; + gf_timer_t *timer; + unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; + int inherited[AFR_NUM_CHANGE_LOGS]; + int on_disk[AFR_NUM_CHANGE_LOGS]; + /*Only 2 types of transactions support eager-locks now. 
DATA/METADATA*/ + afr_lock_t lock[2]; + + /* @open_fd_count: + Number of open FDs queried from the server, as queried through + xdata in FOPs. Currently, used to decide if eager-locking must be + temporarily disabled. + */ + uint32_t open_fd_count; + gf_boolean_t need_refresh; + + /* set if any write on this fd was a non stable write + (i.e, without O_SYNC or O_DSYNC) + */ + gf_boolean_t witnessed_unstable_write; +} afr_inode_ctx_t; - int (*unwind) (call_frame_t *frame, xlator_t *this); +typedef struct _afr_local { + glusterfs_fop_t op; + unsigned int call_count; - /* End of external interface members */ + /* @event_generation: copy of priv->event_generation taken at the + time of starting the transaction. The copy is made so that we + have a stable value through the various phases of the transaction. + */ + unsigned int event_generation; + uint32_t open_fd_count; + int32_t num_inodelks; - /* array of stat's, one for each child */ - struct iatt *buf; - struct iatt parentbuf; - struct iatt entrybuf; + int32_t op_ret; + int32_t op_errno; - /* array of xattr's, one for each child */ - dict_t **xattr; + int dirty[AFR_NUM_CHANGE_LOGS]; - /* array of errno's, one for each child */ - int *child_errno; + int32_t **pending; - int32_t **pending_matrix; - int32_t **delta_matrix; + loc_t loc; + loc_t newloc; - int *sources; - int source; - int active_source; - int active_sinks; - int *success; - unsigned char *locked_nodes; - int lock_count; + fd_t *fd; + afr_fd_ctx_t *fd_ctx; - mode_t impunging_entry_mode; - const char *linkname; + /* @child_up: copy of priv->child_up taken at the time of transaction + start. The copy is taken so that we have a stable child_up array + through the phases of the transaction as priv->child_up[i] can keep + changing through time. + */ + unsigned char *child_up; - int op_failed; + /* @read_attempted: + array of flags representing subvolumes where read operations of + the read transaction have already been attempted. 
The array is + first pre-filled with down subvolumes, and as reads are performed + on other subvolumes, those are set as well. This way if the read + operation fails we do not retry on that subvolume again. + */ + unsigned char *read_attempted; - int file_has_holes; - blksize_t block_size; - off_t file_size; - off_t offset; + /* @readfn: - loc_t parent_loc; + pointer to function which will perform the read operation on a given + subvolume. Used in read transactions. + */ - call_frame_t *orig_frame; - gf_boolean_t unwound; + afr_read_txn_wind_t readfn; - /* private data for the particular self-heal algorithm */ - void *private; + /* @inode: - int (*flush_self_heal_cbk) (call_frame_t *frame, xlator_t *this); + the inode on which the read txn is performed on. ref'ed and copied + from either fd->inode or loc.inode + */ - int (*completion_cbk) (call_frame_t *frame, xlator_t *this); - int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); - int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); + inode_t *inode; - call_frame_t *sh_frame; -} afr_self_heal_t; + /* @parent[2]: + parent inode[s] on which directory transactions are performed. + */ -typedef enum { - AFR_DATA_TRANSACTION, /* truncate, write, ... */ - AFR_METADATA_TRANSACTION, /* chmod, chown, ... */ - AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */ - AFR_ENTRY_RENAME_TRANSACTION, /* rename */ -} afr_transaction_type; + inode_t *parent; + inode_t *parent2; -typedef enum { - AFR_TRANSACTION_LK, - AFR_SELFHEAL_LK, -} transaction_lk_type_t; + /* @readable: -typedef enum { - AFR_LOCK_OP, - AFR_UNLOCK_OP, -} afr_lock_op_type_t; + array of flags representing servers from which a read can be + performed. 
This is the output of afr_inode_refresh() + */ + unsigned char *readable; + unsigned char *readable2; /*For rename transaction*/ -typedef enum { - AFR_DATA_SELF_HEAL_LK, - AFR_METADATA_SELF_HEAL_LK, - AFR_ENTRY_SELF_HEAL_LK, -}selfheal_lk_type_t; + afr_inode_refresh_cbk_t refreshfn; -typedef enum { - AFR_INODELK_TRANSACTION, - AFR_INODELK_NB_TRANSACTION, - AFR_ENTRYLK_TRANSACTION, - AFR_ENTRYLK_NB_TRANSACTION, - AFR_INODELK_SELFHEAL, - AFR_INODELK_NB_SELFHEAL, - AFR_ENTRYLK_SELFHEAL, - AFR_ENTRYLK_NB_SELFHEAL, -} afr_lock_call_type_t; + /* @refreshinode: -/* - xattr format: trusted.afr.volume = [x y z] - x - data pending - y - metadata pending - z - entry pending -*/ + Inode currently getting refreshed. + */ + inode_t *refreshinode; -static inline int -afr_index_for_transaction_type (afr_transaction_type type) -{ - switch (type) { + dict_t *xattr_req; - case AFR_DATA_TRANSACTION: - return 0; + dict_t *dict; - case AFR_METADATA_TRANSACTION: - return 1; + int read_subvol; /* Current read subvolume */ - case AFR_ENTRY_TRANSACTION: - case AFR_ENTRY_RENAME_TRANSACTION: - return 2; - } + int optimistic_change_log; - return -1; /* make gcc happy */ -} + afr_internal_lock_t internal_lock; + /*To handle setattr/setxattr on yet to be linked inode from dht*/ + uuid_t refreshgfid; -typedef struct { - loc_t *lk_loc; - struct gf_flock lk_flock; + /* @refreshed: - const char *lk_basename; - const char *lower_basename; - const char *higher_basename; - char lower_locked; - char higher_locked; + the inode was "refreshed" (i.e, pending xattrs from all subvols + freshly inspected and inode ctx updated accordingly) as part of + this transaction already. 
+ */ + gf_boolean_t refreshed; - unsigned char *locked_nodes; - unsigned char *lower_locked_nodes; - unsigned char *inode_locked_nodes; - unsigned char *entry_locked_nodes; + gf_boolean_t update_num_inodelks; + gf_boolean_t update_open_fd_count; - selfheal_lk_type_t selfheal_lk_type; - transaction_lk_type_t transaction_lk_type; + /* + @pre_op_compat: - int32_t lock_count; - int32_t inodelk_lock_count; - int32_t entrylk_lock_count; + compatibility mode of pre-op. send a separate pre-op and + op operations as part of transaction, rather than combining + */ - uint64_t lock_number; - int32_t lk_call_count; + gf_boolean_t pre_op_compat; - int32_t lock_op_ret; - int32_t lock_op_errno; + /* Is the current writev() going to perform a stable write? + i.e, is fd->flags or @flags writev param have O_SYNC or + O_DSYNC? + */ + gf_boolean_t stable_write; + + /* This write appended to the file. Nnot necessarily O_APPEND, + just means the offset of write was at the end of file. + */ + gf_boolean_t append_write; + + /* + This struct contains the arguments for the "continuation" + (scheme-like) of fops + */ + + struct { + struct { + struct statvfs buf; + unsigned char buf_set; + } statfs; + + struct { + fd_t *fd; + int32_t flags; + } open; + + struct { + struct gf_flock user_flock; + struct gf_flock ret_flock; + unsigned char *locked_nodes; + int32_t cmd; + /*For lock healing only.*/ + unsigned char *dom_locked_nodes; + int32_t *dom_lock_op_ret; + int32_t *dom_lock_op_errno; + struct gf_flock *getlk_rsp; + } lk; + + /* inode read */ + + struct { + int32_t mask; + int last_index; /* index of the child we tried previously */ + } access; + + struct { + int last_index; + } stat; + + struct { + int last_index; + } fstat; + + struct { + size_t size; + int last_index; + } readlink; + + struct { + char *name; + long xattr_len; + int last_index; + } getxattr; + + struct { + size_t size; + off_t offset; + int last_index; + uint32_t flags; + } readv; + + /* dir read */ + + struct { + uint32_t 
*checksum; + int success_count; + int32_t op_ret; + int32_t op_errno; + } opendir; + + struct { + int32_t op_ret; + int32_t op_errno; + size_t size; + off_t offset; + dict_t *dict; + int last_index; + gf_boolean_t failed; + } readdir; + /* inode write */ + + struct { + struct iatt prebuf; + struct iatt postbuf; + } inode_wfop; // common structure for all inode-write-fops + + struct { + struct iovec *vector; + struct iobref *iobref; + off_t offset; + int32_t op_ret; + int32_t count; + uint32_t flags; + } writev; + + struct { + off_t offset; + } truncate; + + struct { + off_t offset; + } ftruncate; + + struct { + struct iatt in_buf; + int32_t valid; + } setattr; + + struct { + struct iatt in_buf; + int32_t valid; + } fsetattr; + + struct { + dict_t *dict; + int32_t flags; + } setxattr; + + struct { + dict_t *dict; + int32_t flags; + } fsetxattr; + + struct { + char *name; + } removexattr; + + struct { + dict_t *xattr; + gf_xattrop_flags_t optype; + } xattrop; + + /* dir write */ + + struct { + inode_t *inode; + struct iatt buf; + struct iatt preparent; + struct iatt postparent; + struct iatt prenewparent; + struct iatt postnewparent; + } dir_fop; // common structure for all dir fops + + struct { + fd_t *fd; + dict_t *params; + int32_t flags; + mode_t mode; + } create; + + struct { + dict_t *params; + dev_t dev; + mode_t mode; + } mknod; + + struct { + dict_t *params; + int32_t mode; + } mkdir; + + struct { + dict_t *params; + char *linkpath; + } symlink; + + struct { + off_t offset; + size_t len; + int32_t mode; + } fallocate; + + struct { + off_t offset; + size_t len; + } discard; + + struct { + off_t offset; + off_t len; + struct iatt prebuf; + struct iatt postbuf; + } zerofill; + + struct { + char *volume; + int32_t cmd; + int32_t in_cmd; + struct gf_flock in_flock; + struct gf_flock flock; + void *xdata; + } inodelk; + + struct { + char *volume; + char *basename; + void *xdata; + entrylk_cmd in_cmd; + entrylk_cmd cmd; + entrylk_type type; + } entrylk; + + struct 
{ + off_t offset; + gf_seek_what_t what; + } seek; + + struct { + struct gf_lease user_lease; + struct gf_lease ret_lease; + unsigned char *locked_nodes; + } lease; + + struct { + int flags; + } rmdir; + + struct { + int32_t datasync; + } fsync; + + struct { + uuid_t gfid_req; + gf_boolean_t needs_fresh_lookup; + } lookup; + + } cont; + + struct { + char *basename; + char *new_basename; + + loc_t parent_loc; + loc_t new_parent_loc; + + /* stub to resume on destruction + of the transaction frame */ + call_stub_t *resume_stub; + + struct list_head owner_list; + struct list_head wait_list; + + unsigned char *pre_op; + + /* Changelog xattr dict for [f]xattrop*/ + dict_t **changelog_xdata; + unsigned char *pre_op_sources; + + /* @failed_subvols: subvolumes on which a pre-op or a + FOP failed. */ + unsigned char *failed_subvols; + + call_frame_t *main_frame; /*Fop frame*/ + call_frame_t *frame; /*Transaction frame*/ + + int (*wind)(call_frame_t *frame, xlator_t *this, int subvol); + + int (*unwind)(call_frame_t *frame, xlator_t *this); + + off_t start, len; + + afr_transaction_type type; + + int32_t in_flight_sb_errno; /* This is where the cause of the + failure on the last good copy of + the file is stored. + */ + + /* @changelog_resume: function to be called after changlogging + (either pre-op or post-op) is done + */ + afr_changelog_resume_t changelog_resume; + + gf_boolean_t eager_lock_on; + gf_boolean_t do_eager_unlock; + + /* @dirtied: flag which indicates whether we set dirty flag + in the OP. Typically true when we are performing operation + on more than one subvol and optimistic changelog is disabled + + A 'true' value set in @dirtied flag means an 'undirtying' + has to be done in POST-OP phase. 
+ */ + gf_boolean_t dirtied; + + /* @inherited: flag which indicates that the dirty flags + of the previous transaction were inherited + */ + gf_boolean_t inherited; + + /* + @no_uninherit: flag which indicates that a pre_op_uninherit() + must _not_ be attempted (and returned as failure) always. This + flag is set when a hard pre-op is performed, but not accounted + for it in fd_ctx->on_disk[]. Such transactions are "isolated" + from the pre-op piggybacking entirely and therefore uninherit + must not be attempted. + */ + gf_boolean_t no_uninherit; + + gf_boolean_t in_flight_sb; /* Indicator for occurrence of + split-brain while in the middle of + a txn. */ + + /* @uninherit_done: + @uninherit_value: + + The above pair variables make pre_op_uninherit() idempotent. + Both are FALSE initially. The first call to pre_op_uninherit + sets @uninherit_done to TRUE and the return value to + @uninherit_value. Further calls will check for @uninherit_done + to be TRUE and if so will simply return @uninherit_value. + */ + gf_boolean_t uninherit_done; + gf_boolean_t uninherit_value; + + gf_boolean_t disable_delayed_post_op; + } transaction; + + syncbarrier_t barrier; - int (*lock_cbk) (call_frame_t *, xlator_t *); + /* extra data for fops */ + dict_t *xdata_req; + dict_t *xdata_rsp; -} afr_internal_lock_t; + dict_t *xattr_rsp; /*for [f]xattrop*/ + + mode_t umask; + int xflag; + struct afr_reply *replies; + + /* For client side background heals. 
*/ + struct list_head healer; + call_frame_t *heal_frame; + + afr_inode_ctx_t *inode_ctx; + + /*For thin-arbiter transactions.*/ + int ta_failed_subvol; + int ta_event_gen; + struct list_head ta_waitq; + struct list_head ta_onwireq; + afr_ta_fop_state_t fop_state; + afr_fop_lock_state_t fop_lock_state; + gf_lkowner_t saved_lk_owner; + unsigned char read_txn_query_child; + unsigned char ta_child_up; + gf_boolean_t do_discovery; + gf_boolean_t need_full_crawl; + gf_boolean_t is_read_txn; + gf_boolean_t is_new_entry; +} afr_local_t; -typedef struct _afr_locked_fd { - fd_t *fd; - struct list_head list; -} afr_locked_fd_t; +typedef struct afr_spbc_timeout { + call_frame_t *frame; + loc_t *loc; + int spb_child_index; + gf_boolean_t d_spb; + gf_boolean_t m_spb; +} afr_spbc_timeout_t; + +typedef struct afr_spb_status { + call_frame_t *frame; + loc_t *loc; +} afr_spb_status_t; + +typedef struct afr_empty_brick_args { + call_frame_t *frame; + char *op_type; + loc_t loc; + int empty_index; +} afr_empty_brick_args_t; + +typedef struct afr_read_subvol_args { + ia_type_t ia_type; + uuid_t gfid; +} afr_read_subvol_args_t; + +typedef struct afr_granular_esh_args { + fd_t *heal_fd; + xlator_t *xl; + call_frame_t *frame; + gf_boolean_t mismatch; /* flag to represent occurrence of type/gfid + mismatch */ +} afr_granular_esh_args_t; -typedef struct _afr_local { - int uid; - int gid; - unsigned int call_count; - unsigned int success_count; - unsigned int enoent_count; +int +afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, int type); +int +afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int *event_generation); +int +__afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int *event_generation); +int +__afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, + unsigned 
char *data_subvols, + unsigned char *metadata_subvol, + int event_generation); +int +afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int event_generation); - unsigned int govinda_gOvinda; +int +__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this); - unsigned int read_child_index; - unsigned char read_child_returned; - unsigned int first_up_child; +int +afr_inode_need_refresh_set(inode_t *inode, xlator_t *this); - pid_t saved_pid; +int +afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this, + unsigned char *readable, + afr_read_subvol_args_t *args); - int32_t op_ret; - int32_t op_errno; +int +afr_inode_read_subvol_type_get(inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, int type); +int +afr_read_subvol_get(inode_t *inode, xlator_t *this, int *subvol_p, + unsigned char *readables, int *event_p, + afr_transaction_type type, afr_read_subvol_args_t *args); - int32_t **pending; +#define afr_data_subvol_get(i, t, s, r, e, a) \ + afr_read_subvol_get(i, t, s, r, e, AFR_DATA_TRANSACTION, a) - loc_t loc; - loc_t newloc; +#define afr_metadata_subvol_get(i, t, s, r, e, a) \ + afr_read_subvol_get(i, t, s, r, e, AFR_METADATA_TRANSACTION, a) - fd_t *fd; +int +afr_inode_refresh(call_frame_t *frame, xlator_t *this, inode_t *inode, + uuid_t gfid, afr_inode_refresh_cbk_t cbk); - glusterfs_fop_t fop; +int32_t +afr_notify(xlator_t *this, int32_t event, void *data, void *data2); - unsigned char *child_up; +int +xattr_is_equal(dict_t *this, char *key1, data_t *value1, void *data); - int32_t *child_errno; +int +afr_add_entry_lockee(afr_local_t *local, loc_t *loc, char *basename, + int child_count); - dict_t *xattr_req; +int +afr_add_inode_lockee(afr_local_t *local, int child_count); - int32_t inodelk_count; - int32_t entrylk_count; +void +afr_lockees_cleanup(afr_internal_lock_t *int_lock); - afr_internal_lock_t internal_lock; +int +afr_attempt_lock_recovery(xlator_t 
*this, int32_t child_index); - afr_locked_fd_t *locked_fd; - int32_t source_child; - int32_t lock_recovery_child; +int +afr_mark_locked_nodes(xlator_t *this, fd_t *fd, unsigned char *locked_nodes); - dict_t *dict; - int optimistic_change_log; +void +afr_set_lk_owner(call_frame_t *frame, xlator_t *this, void *lk_owner); - int (*openfd_flush_cbk) (call_frame_t *frame, xlator_t *this); +int +afr_set_lock_number(call_frame_t *frame, xlator_t *this); - /* - This struct contains the arguments for the "continuation" - (scheme-like) of fops - */ +int32_t +afr_unlock(call_frame_t *frame, xlator_t *this); - int op; - struct { - struct { - unsigned char buf_set; - struct statvfs buf; - } statfs; +int +afr_lock_nonblocking(call_frame_t *frame, xlator_t *this); - struct { - inode_t *inode; - struct iatt buf; - struct iatt read_child_buf; - struct iatt postparent; - ino_t ino; - uint64_t gen; - ino_t parent_ino; - dict_t *xattr; - dict_t **xattrs; - gf_boolean_t is_revalidate; - } lookup; +int +afr_blocking_lock(call_frame_t *frame, xlator_t *this); - struct { - int32_t flags; - int32_t wbflags; - } open; - - struct { - int32_t cmd; - struct gf_flock user_flock; - struct gf_flock ret_flock; - unsigned char *locked_nodes; - } lk; - - /* inode read */ - - struct { - int32_t mask; - int last_tried; /* index of the child we tried previously */ - } access; - - struct { - int last_tried; - ino_t ino; - } stat; - - struct { - int last_tried; - ino_t ino; - } fstat; - - struct { - size_t size; - int last_tried; - ino_t ino; - } readlink; - - struct { - char *name; - int last_tried; - } getxattr; - - struct { - ino_t ino; - size_t size; - off_t offset; - int last_tried; - } readv; - - /* dir read */ - - struct { - int success_count; - int32_t op_ret; - int32_t op_errno; - - uint32_t *checksum; - } opendir; - - struct { - int32_t op_ret; - int32_t op_errno; - size_t size; - off_t offset; - - gf_boolean_t failed; - int last_tried; - } readdir; - - struct { - int32_t op_ret; - int32_t 
op_errno; - - size_t size; - off_t offset; - int32_t flag; - - int last_tried; - } getdents; - - /* inode write */ - - struct { - ino_t ino; - struct iatt prebuf; - struct iatt postbuf; - - int32_t op_ret; - - struct iovec *vector; - struct iobref *iobref; - int32_t count; - off_t offset; - } writev; - - struct { - ino_t ino; - struct iatt prebuf; - struct iatt postbuf; - } fsync; - - struct { - ino_t ino; - off_t offset; - struct iatt prebuf; - struct iatt postbuf; - } truncate; - - struct { - ino_t ino; - off_t offset; - struct iatt prebuf; - struct iatt postbuf; - } ftruncate; - - struct { - ino_t ino; - struct iatt in_buf; - int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; - } setattr; - - struct { - ino_t ino; - struct iatt in_buf; - int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; - } fsetattr; - - struct { - dict_t *dict; - int32_t flags; - } setxattr; - - struct { - char *name; - } removexattr; - - /* dir write */ - - struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; - fd_t *fd; - dict_t *params; - int32_t flags; - mode_t mode; - inode_t *inode; - struct iatt buf; - struct iatt preparent; - struct iatt postparent; - struct iatt read_child_buf; - } create; - - struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; - dev_t dev; - mode_t mode; - dict_t *params; - inode_t *inode; - struct iatt buf; - struct iatt preparent; - struct iatt postparent; - struct iatt read_child_buf; - } mknod; - - struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; - int32_t mode; - dict_t *params; - inode_t *inode; - struct iatt buf; - struct iatt read_child_buf; - struct iatt preparent; - struct iatt postparent; - } mkdir; - - struct { - ino_t parent_ino; - int32_t op_ret; - int32_t op_errno; - struct iatt preparent; - struct iatt postparent; - } unlink; - - struct { - int flags; - ino_t parent_ino; - int32_t op_ret; - int32_t op_errno; - struct iatt preparent; - struct iatt postparent; - } rmdir; - - struct { - ino_t 
oldparent_ino; - ino_t newparent_ino; - ino_t ino; - struct iatt buf; - struct iatt read_child_buf; - struct iatt preoldparent; - struct iatt prenewparent; - struct iatt postoldparent; - struct iatt postnewparent; - } rename; - - struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; - inode_t *inode; - struct iatt buf; - struct iatt read_child_buf; - struct iatt preparent; - struct iatt postparent; - } link; - - struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; - inode_t *inode; - dict_t *params; - struct iatt buf; - struct iatt read_child_buf; - char *linkpath; - struct iatt preparent; - struct iatt postparent; - } symlink; - - struct { - int32_t flags; - dir_entry_t *entries; - int32_t count; - } setdents; - } cont; - - struct { - off_t start, len; - - char *basename; - char *new_basename; - - loc_t parent_loc; - loc_t new_parent_loc; - - afr_transaction_type type; - - int success_count; - int erase_pending; - int failure_count; - - int last_tried; - int32_t *child_errno; - - call_frame_t *main_frame; - - int (*fop) (call_frame_t *frame, xlator_t *this); - - int (*done) (call_frame_t *frame, xlator_t *this); - - int (*resume) (call_frame_t *frame, xlator_t *this); - - int (*unwind) (call_frame_t *frame, xlator_t *this); - - /* post-op hook */ - } transaction; - - afr_self_heal_t self_heal; - - struct marker_str marker; -} afr_local_t; +int +afr_internal_lock_finish(call_frame_t *frame, xlator_t *this); +int +__afr_fd_ctx_set(xlator_t *this, fd_t *fd); -typedef struct { - unsigned int *pre_op_done; - unsigned int *opened_on; /* which subvolumes the fd is open on */ - unsigned int *pre_op_piggyback; +afr_fd_ctx_t * +afr_fd_ctx_get(fd_t *fd, xlator_t *this); - int flags; - int32_t wbflags; - uint64_t up_count; /* number of CHILD_UPs this fd has seen */ - uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */ +int +afr_build_parent_loc(loc_t *parent, loc_t *child, int32_t *op_errno); - int32_t last_tried; +int +afr_locked_nodes_count(unsigned 
char *locked_nodes, int child_count); - int hit, miss; - gf_boolean_t failed_over; - struct list_head entries; /* needed for readdir failover */ +int +afr_replies_interpret(call_frame_t *frame, xlator_t *this, inode_t *inode, + gf_boolean_t *start_heal); - unsigned char *locked_on; /* which subvolumes locks have been successful */ -} afr_fd_ctx_t; +void +afr_local_replies_wipe(afr_local_t *local, afr_private_t *priv); +void +afr_local_cleanup(afr_local_t *local, xlator_t *this); + +int +afr_frame_return(call_frame_t *frame); + +int +afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata); -/* try alloc and if it fails, goto label */ -#define ALLOC_OR_GOTO(var, type, label) do { \ - var = GF_CALLOC (sizeof (type), 1, \ - gf_afr_mt_##type); \ - if (!var) { \ - gf_log (this->name, GF_LOG_ERROR, \ - "out of memory :("); \ - op_errno = ENOMEM; \ - goto label; \ - } \ - } while (0); +void +afr_local_transaction_cleanup(afr_local_t *local, xlator_t *this); +int +afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd); + +#define AFR_STACK_UNWIND(fop, frame, op_ret, op_errno, params...) 
\ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + int32_t __op_ret = 0; \ + int32_t __op_errno = 0; \ + \ + __op_ret = op_ret; \ + __op_errno = op_errno; \ + if (frame) { \ + __local = frame->local; \ + __this = frame->this; \ + afr_handle_inconsistent_fop(frame, &__op_ret, &__op_errno); \ + if (__local && __local->is_read_txn) \ + afr_pending_read_decrement(__this->private, \ + __local->read_subvol); \ + if (__local && __local->xdata_req && \ + afr_is_lock_mode_mandatory(__local->xdata_req)) \ + afr_dom_lock_release(frame); \ + frame->local = NULL; \ + } \ + \ + STACK_UNWIND_STRICT(fop, frame, __op_ret, __op_errno, params); \ + if (__local) { \ + afr_local_cleanup(__local, __this); \ + mem_put(__local); \ + } \ + } while (0) + +#define AFR_STACK_DESTROY(frame) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + STACK_DESTROY(frame->root); \ + if (__local) { \ + afr_local_cleanup(__local, __this); \ + mem_put(__local); \ + } \ + } while (0); + +#define AFR_FRAME_INIT(frame, op_errno) \ + ({ \ + frame->local = mem_get0(THIS->local_pool); \ + if (afr_local_init(frame->local, frame->this->private, &op_errno)) { \ + afr_local_cleanup(frame->local, frame->this); \ + mem_put(frame->local); \ + frame->local = NULL; \ + }; \ + frame->local; \ + }) + +#define AFR_STACK_RESET(frame) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + int __opr; \ + STACK_RESET(frame->root); \ + if (__local) { \ + afr_local_cleanup(__local, __this); \ + mem_put(__local); \ + } \ + AFR_FRAME_INIT(frame, __opr); \ + } while (0) -/* did a call fail due to a child failing? 
*/ -#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ - ((op_errno == ENOTCONN) || \ - (op_errno == EBADFD))) +/* allocate and return a string that is the basename of argument */ +static inline char * +AFR_BASENAME(const char *str) +{ + char *__tmp_str = NULL; + char *__basename_str = NULL; + __tmp_str = gf_strdup(str); + __basename_str = gf_strdup(basename(__tmp_str)); + GF_FREE(__tmp_str); + return __basename_str; +} -#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1) +call_frame_t * +afr_copy_frame(call_frame_t *base); -/* have we tried all children? */ -#define all_tried(i, count) ((i) == (count) - 1) +int +afr_transaction_local_init(afr_local_t *local, xlator_t *this); int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid); +afr_marker_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, afr_local_t *local, afr_private_t *priv); int -pump_command_reply (call_frame_t *frame, xlator_t *this); +afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno); -int32_t -afr_notify (xlator_t *this, int32_t event, - void *data, ...); +int +afr_internal_lock_init(afr_internal_lock_t *lk, size_t child_count); int -afr_attempt_lock_recovery (xlator_t *this, int32_t child_index); +afr_higher_errno(int32_t old_errno, int32_t new_errno); int -afr_save_locked_fd (xlator_t *this, fd_t *fd); +afr_final_errno(afr_local_t *local, afr_private_t *priv); int -afr_mark_locked_nodes (xlator_t *this, fd_t *fd, - unsigned char *locked_nodes); +afr_xattr_req_prepare(xlator_t *this, dict_t *xattr_req); void -afr_set_lk_owner (call_frame_t *frame, xlator_t *this); +afr_fix_open(fd_t *fd, xlator_t *this); +afr_fd_ctx_t * +afr_fd_ctx_get(fd_t *fd, xlator_t *this); + +void +afr_set_low_priority(call_frame_t *frame); int -afr_set_lock_number (call_frame_t *frame, xlator_t *this); +afr_child_fd_ctx_set(xlator_t *this, fd_t *fd, int32_t child, int flags); +void +afr_matrix_cleanup(int32_t **pending, unsigned int m); -loc_t * -lower_path 
(loc_t *l1, const char *b1, loc_t *l2, const char *b2); +int32_t ** +afr_matrix_create(unsigned int m, unsigned int n); -int32_t -afr_unlock (call_frame_t *frame, xlator_t *this); +int ** +afr_mark_pending_changelog(afr_private_t *priv, unsigned char *pending, + dict_t *xattr, ia_type_t iat); + +void +afr_filter_xattrs(dict_t *xattr); + +/* + * Special value indicating we should use the "auto" quorum method instead of + * a fixed value (including zero to turn off quorum enforcement). + */ +#define AFR_QUORUM_AUTO INT_MAX int -afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this); +afr_fd_report_unstable_write(xlator_t *this, afr_local_t *local); + +gf_boolean_t +afr_fd_has_witnessed_unstable_write(xlator_t *this, inode_t *inode); + +void +afr_reply_wipe(struct afr_reply *reply); + +void +afr_replies_wipe(struct afr_reply *replies, int count); + +gf_boolean_t +afr_xattrs_are_equal(dict_t *dict1, dict_t *dict2); + +gf_boolean_t +afr_is_xattr_ignorable(char *key); int -afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this); +afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc); int -afr_blocking_lock (call_frame_t *frame, xlator_t *this); +afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc); int -afr_internal_lock_finish (call_frame_t *frame, xlator_t *this); +afr_get_split_brain_status(void *opaque); +int +afr_get_split_brain_status_cbk(int ret, call_frame_t *frame, void *opaque); -int pump_start (call_frame_t *frame, xlator_t *this); +int +afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this, + int spb_choice); +int +afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this, + call_frame_t *frame, int *spb_subvol); +int +afr_get_child_index_from_name(xlator_t *this, char *name); int -afr_fd_ctx_set (xlator_t *this, fd_t *fd); +afr_is_split_brain(call_frame_t *frame, xlator_t *this, inode_t *inode, + uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb); +int +afr_spb_choice_timeout_cancel(xlator_t 
*this, inode_t *inode); -uint64_t -afr_read_child (xlator_t *this, inode_t *inode); +int +afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque); -void -afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child); +gf_boolean_t +afr_get_need_heal(xlator_t *this); void -afr_build_parent_loc (loc_t *parent, loc_t *child); +afr_set_need_heal(xlator_t *this, afr_local_t *local); int -afr_up_children_count (int child_count, unsigned char *child_up); +afr_selfheal_data_open(xlator_t *this, inode_t *inode, fd_t **fd); int -afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); - -ino64_t -afr_itransform (ino64_t ino, int child_count, int child_index); +afr_get_msg_id(char *op_type); int -afr_deitransform (ino64_t ino, int child_count); +afr_set_in_flight_sb_status(xlator_t *this, call_frame_t *frame, + inode_t *inode); + +int32_t +afr_quorum_errno(afr_private_t *priv); +gf_boolean_t +afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv, + int32_t *op_errno); void -afr_local_cleanup (afr_local_t *local, xlator_t *this); +afr_handle_inconsistent_fop(call_frame_t *frame, int32_t *op_ret, + int32_t *op_errno); -int -afr_frame_return (call_frame_t *frame); +void +afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata); +void +afr_process_post_writev(call_frame_t *frame, xlator_t *this); -uint64_t -afr_is_split_brain (xlator_t *this, inode_t *inode); +void +afr_writev_unwind(call_frame_t *frame, xlator_t *this); void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set); +afr_writev_copy_outvars(call_frame_t *src_frame, call_frame_t *dst_frame); + +void +afr_update_uninodelk(afr_local_t *local, afr_internal_lock_t *int_lock, + int32_t child_index); +afr_fd_ctx_t * +__afr_fd_ctx_get(fd_t *fd, xlator_t *this); + +gf_boolean_t +afr_is_inode_refresh_reqd(inode_t *inode, xlator_t *this, int 
event_gen1, + int event_gen2); int -afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags); +afr_serialize_xattrs_with_delimiter(call_frame_t *frame, xlator_t *this, + char *buf, const char *default_str, + int32_t *serz_len, char delimiter); +gf_boolean_t +afr_is_symmetric_error(call_frame_t *frame, xlator_t *this); -void -afr_set_opendir_done (xlator_t *this, inode_t *inode); +int +__afr_inode_ctx_get(xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx); uint64_t -afr_is_opendir_done (xlator_t *this, inode_t *inode); +afr_write_subvol_get(call_frame_t *frame, xlator_t *this); -void -afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this); - -int -afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd); - -int -afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd); - -#define AFR_STACK_UNWIND(fop, frame, params ...) \ - do { \ - afr_local_t *__local = NULL; \ - xlator_t *__this = NULL; \ - if (frame) { \ - __local = frame->local; \ - __this = frame->this; \ - frame->local = NULL; \ - } \ - STACK_UNWIND_STRICT (fop, frame, params); \ - afr_local_cleanup (__local, __this); \ - GF_FREE (__local); \ - } while (0); - -#define AFR_STACK_DESTROY(frame) \ - do { \ - afr_local_t *__local = NULL; \ - xlator_t *__this = NULL; \ - __local = frame->local; \ - __this = frame->this; \ - frame->local = NULL; \ - STACK_DESTROY (frame->root); \ - afr_local_cleanup (__local, __this); \ - GF_FREE (__local); \ - } while (0); +int +afr_write_subvol_set(call_frame_t *frame, xlator_t *this); -/* allocate and return a string that is the basename of argument */ -static inline char * -AFR_BASENAME (const char *str) -{ - char *__tmp_str = NULL; - char *__basename_str = NULL; - __tmp_str = gf_strdup (str); - __basename_str = gf_strdup (basename (__tmp_str)); - GF_FREE (__tmp_str); - return __basename_str; -} +int +afr_write_subvol_reset(call_frame_t *frame, xlator_t *this); -/* initialize local_t */ -static inline int 
-AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv) -{ - int child_up_count = 0; +int +afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode); - local->child_up = GF_CALLOC (sizeof (*local->child_up), - priv->child_count, - gf_afr_mt_char); - if (!local->child_up) { - return -ENOMEM; - } +int +afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop); - memcpy (local->child_up, priv->child_up, - sizeof (*local->child_up) * priv->child_count); +int +afr_ta_post_op_lock(xlator_t *this, loc_t *loc); - child_up_count = afr_up_children_count (priv->child_count, local->child_up); +int +afr_ta_post_op_unlock(xlator_t *this, loc_t *loc); - if (priv->optimistic_change_log && child_up_count == priv->child_count) - local->optimistic_change_log = 1; +gf_boolean_t +afr_is_pending_set(xlator_t *this, dict_t *xdata, int type); - local->call_count = afr_up_children_count (priv->child_count, local->child_up); - if (local->call_count == 0) - return -ENOTCONN; +int +__afr_get_up_children_count(afr_private_t *priv); - local->transaction.erase_pending = 1; +call_frame_t * +afr_ta_frame_create(xlator_t *this); - local->op_ret = -1; - local->op_errno = EUCLEAN; +gf_boolean_t +afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local); - local->internal_lock.lock_op_ret = -1; - local->internal_lock.lock_op_errno = EUCLEAN; +void +afr_ta_lock_release_synctask(xlator_t *this); +void +afr_ta_locked_priv_invalidate(afr_private_t *priv); - return 0; -} +gf_boolean_t +afr_lookup_has_quorum(call_frame_t *frame, + const unsigned int up_children_count); +void +afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this); -/** - * first_up_child - return the index of the first child that is up - */ +void +afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this); -static inline int -afr_first_up_child (afr_private_t *priv) -{ - xlator_t ** children = NULL; - int ret = -1; - int i = 0; - - LOCK (&priv->lock); - { - children = priv->children; - for 
(i = 0; i < priv->child_count; i++) { - if (priv->child_up[i]) { - ret = i; - break; - } - } - } - UNLOCK (&priv->lock); - - return ret; -} +gf_boolean_t +afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, + int child); +void +afr_selfheal_childup(xlator_t *this, afr_private_t *priv); -static inline int -afr_transaction_local_init (afr_local_t *local, afr_private_t *priv) -{ - int i; - - local->first_up_child = afr_first_up_child (priv); - - local->child_errno = GF_CALLOC (sizeof (*local->child_errno), - priv->child_count, - gf_afr_mt_int32_t); - if (!local->child_errno) { - return -ENOMEM; - } - - local->pending = GF_CALLOC (sizeof (*local->pending), - priv->child_count, - gf_afr_mt_int32_t); - - if (!local->pending) { - return -ENOMEM; - } - - for (i = 0; i < priv->child_count; i++) { - local->pending[i] = GF_CALLOC (sizeof (*local->pending[i]), - 3, /* data + metadata + entry */ - gf_afr_mt_int32_t); - if (!local->pending[i]) - return -ENOMEM; - } - - local->internal_lock.inode_locked_nodes = - GF_CALLOC (sizeof (*local->internal_lock.inode_locked_nodes), - priv->child_count, - gf_afr_mt_char); - - local->internal_lock.entry_locked_nodes = - GF_CALLOC (sizeof (*local->internal_lock.entry_locked_nodes), - priv->child_count, - gf_afr_mt_char); - - local->internal_lock.locked_nodes = - GF_CALLOC (sizeof (*local->internal_lock.locked_nodes), - priv->child_count, - gf_afr_mt_char); - - local->internal_lock.lower_locked_nodes - = GF_CALLOC (sizeof (*local->internal_lock.lower_locked_nodes), - priv->child_count, - gf_afr_mt_char); - - local->transaction.child_errno = GF_CALLOC (sizeof (*local->transaction.child_errno), - priv->child_count, - gf_afr_mt_int32_t); - - local->internal_lock.transaction_lk_type = AFR_TRANSACTION_LK; - - return 0; -} +gf_boolean_t +afr_is_lock_mode_mandatory(dict_t *xdata); -int32_t -afr_marker_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv ); +void 
+afr_dom_lock_release(call_frame_t *frame); + +void +afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, + unsigned char *replies); +gf_boolean_t +afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name, + pid_t pid); #endif /* __AFR_H__ */ diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c deleted file mode 100644 index 5b1a4f62316..00000000000 --- a/xlators/cluster/afr/src/pump.c +++ /dev/null @@ -1,2536 +0,0 @@ -/* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - -#include <unistd.h> -#include <sys/time.h> -#include <stdlib.h> - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "afr-common.c" -#include "defaults.c" - -static int -pump_mark_start_pending (xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - priv = this->private; - pump_priv = priv->pump_private; - - pump_priv->pump_start_pending = 1; - - return 0; -} - -static int -is_pump_start_pending (xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - priv = this->private; - pump_priv = priv->pump_private; - - return (pump_priv->pump_start_pending); -} - -static int -pump_remove_start_pending (xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - priv = this->private; - pump_priv = priv->pump_private; - - pump_priv->pump_start_pending = 0; - - return 0; -} - -static pump_state_t -pump_get_state () -{ - xlator_t *this = NULL; - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - pump_state_t ret; - - this = THIS; - priv = this->private; - pump_priv = priv->pump_private; - - LOCK (&pump_priv->pump_state_lock); - { - ret = pump_priv->pump_state; - } - UNLOCK (&pump_priv->pump_state_lock); - - return ret; -} - -int -pump_change_state (xlator_t *this, pump_state_t state) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - pump_state_t state_old; - pump_state_t state_new; - - - priv = this->private; - pump_priv = priv->pump_private; - - GF_ASSERT (pump_priv); - - LOCK (&pump_priv->pump_state_lock); - { - state_old = pump_priv->pump_state; - state_new = state; - - pump_priv->pump_state = state; - - } - UNLOCK (&pump_priv->pump_state_lock); - - gf_log (this->name, GF_LOG_DEBUG, - "Pump changing state from %d to %d", - state_old, - state_new); - - return 0; -} - -static int -pump_set_resume_path (xlator_t *this, const char *path) -{ - int ret = 0; - - afr_private_t *priv = NULL; - pump_private_t 
*pump_priv = NULL; - - priv = this->private; - pump_priv = priv->pump_private; - - GF_ASSERT (pump_priv); - - LOCK (&pump_priv->resume_path_lock); - { - pump_priv->resume_path = strdup (path); - if (!pump_priv->resume_path) - ret = -1; - } - UNLOCK (&pump_priv->resume_path_lock); - - return ret; -} - -static void -build_child_loc (loc_t *parent, loc_t *child, char *path, char *name) -{ - child->path = path; - child->name = name; - - child->parent = inode_ref (parent->inode); - child->inode = inode_new (parent->inode->table); -} - -static char * -build_file_path (loc_t *loc, gf_dirent_t *entry) -{ - xlator_t *this = NULL; - char *file_path = NULL; - int pathlen = 0; - int total_size = 0; - - this = THIS; - - pathlen = STRLEN_0 (loc->path); - - if (IS_ROOT_PATH (loc->path)) { - total_size = pathlen + entry->d_len; - file_path = GF_CALLOC (1, total_size, gf_afr_mt_char); - if (!file_path) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - return NULL; - } - - gf_log (this->name, GF_LOG_TRACE, - "constructing file path of size=%d" - "pathlen=%d, d_len=%d", - total_size, pathlen, - entry->d_len); - - snprintf(file_path, total_size, "%s%s", loc->path, entry->d_name); - - } else { - total_size = pathlen + entry->d_len + 1; /* for the extra '/' in the path */ - file_path = GF_CALLOC (1, total_size + 1, gf_afr_mt_char); - if (!file_path) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - return NULL; - } - - gf_log (this->name, GF_LOG_TRACE, - "constructing file path of size=%d" - "pathlen=%d, d_len=%d", - total_size, pathlen, - entry->d_len); - - snprintf(file_path, total_size, "%s/%s", loc->path, entry->d_name); - } - - gf_log (this->name, GF_LOG_TRACE, - "path=%s and d_name=%s", loc->path, entry->d_name); - gf_log (this->name, GF_LOG_TRACE, - "constructed file_path=%s of size=%d", file_path, total_size); - - return file_path; -} - -static int -pump_save_path (xlator_t *this, const char *path) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv 
= NULL; - pump_state_t state; - dict_t *dict = NULL; - loc_t loc; - int dict_ret = 0; - int ret = -1; - - state = pump_get_state (); - if (state == PUMP_STATE_RESUME) - return 0; - - priv = this->private; - pump_priv = priv->pump_private; - - GF_ASSERT (priv->root_inode); - - build_root_loc (priv->root_inode, &loc); - - dict = dict_new (); - dict_ret = dict_set_str (dict, PUMP_PATH, (char *)path); - - ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0); - - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "setxattr failed - could not save path=%s", path); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "setxattr succeeded - saved path=%s", path); - gf_log (this->name, GF_LOG_DEBUG, - "Saving path for status info"); - } - - dict_unref (dict); - - return 0; -} - -static int -pump_check_and_update_status (xlator_t *this) -{ - pump_state_t state; - int ret = -1; - - state = pump_get_state (); - - switch (state) { - - case PUMP_STATE_RESUME: - case PUMP_STATE_RUNNING: - { - ret = 0; - break; - } - case PUMP_STATE_PAUSE: - { - ret = -1; - break; - } - case PUMP_STATE_ABORT: - { - pump_save_path (this, "/"); - ret = -1; - break; - } - default: - { - gf_log (this->name, GF_LOG_DEBUG, - "Unknown pump state"); - ret = -1; - break; - } - - } - - return ret; -} - -static const char * -pump_get_resume_path (xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - const char *resume_path = NULL; - - priv = this->private; - pump_priv = priv->pump_private; - - resume_path = pump_priv->resume_path; - - return resume_path; -} - -static int -pump_update_resume_state (xlator_t *this, const char *path) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - pump_state_t state; - const char *resume_path = NULL; - - priv = this->private; - pump_priv = priv->pump_private; - - state = pump_get_state (); - - if (state == PUMP_STATE_RESUME) { - resume_path = pump_get_resume_path (this); - if (strcmp (resume_path, "/") == 0) { 
- gf_log (this->name, GF_LOG_DEBUG, - "Reached the resume path (/). Proceeding to change state" - " to running"); - pump_change_state (this, PUMP_STATE_RUNNING); - } else if (strcmp (resume_path, path) == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Reached the resume path. Proceeding to change state" - " to running"); - pump_change_state (this, PUMP_STATE_RUNNING); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "Not yet hit the resume path:res-path=%s,path=%s", - resume_path, path); - } - } - - return 0; -} - -static gf_boolean_t -is_pump_traversal_allowed (xlator_t *this, const char *path) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - pump_state_t state; - const char *resume_path = NULL; - gf_boolean_t ret = _gf_true; - - priv = this->private; - pump_priv = priv->pump_private; - - state = pump_get_state (); - - if (state == PUMP_STATE_RESUME) { - resume_path = pump_get_resume_path (this); - if (strstr (resume_path, path)) { - gf_log (this->name, GF_LOG_DEBUG, - "On the right path to resumption path"); - ret = _gf_true; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "Not the right path to resuming=> ignoring traverse"); - ret = _gf_false; - } - } - - return ret; -} - -static int -pump_save_file_stats (xlator_t *this, const char *path) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - priv = this->private; - pump_priv = priv->pump_private; - - LOCK (&pump_priv->resume_path_lock); - { - pump_priv->number_files_pumped++; - - strncpy (pump_priv->current_file, path, - PATH_MAX); - } - UNLOCK (&pump_priv->resume_path_lock); - - return 0; -} - -static int -gf_pump_traverse_directory (loc_t *loc) -{ - xlator_t *this = NULL; - afr_private_t *priv = NULL; - fd_t *fd = NULL; - - off_t offset = 0; - loc_t entry_loc; - gf_dirent_t *entry = NULL; - gf_dirent_t *tmp = NULL; - gf_dirent_t entries; - - struct iatt iatt, parent; - dict_t *xattr_rsp; - - int source = 0; - - char *file_path = NULL; - int ret = 0; - - INIT_LIST_HEAD 
(&entries.list); - this = THIS; - priv = this->private; - - GF_ASSERT (loc->inode); - - fd = fd_create (loc->inode, PUMP_PID); - if (!fd) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to create fd for %s", loc->path); - goto out; - } - - ret = syncop_opendir (priv->children[source], loc, fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "opendir failed on %s", loc->path); - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "pump opendir on %s returned=%d", - loc->path, ret); - - while (syncop_readdirp (priv->children[source], fd, 131072, offset, &entries)) { - - if (list_empty (&entries.list)) { - gf_log (this->name, GF_LOG_TRACE, - "no more entries in directory"); - goto out; - } - - list_for_each_entry_safe (entry, tmp, &entries.list, list) { - gf_log (this->name, GF_LOG_DEBUG, - "found readdir entry=%s", entry->d_name); - - file_path = build_file_path (loc, entry); - if (!file_path) { - gf_log (this->name, GF_LOG_DEBUG, - "file path construction failed"); - goto out; - } - - build_child_loc (loc, &entry_loc, file_path, entry->d_name); - - if (!IS_ENTRY_CWD (entry->d_name) && - !IS_ENTRY_PARENT (entry->d_name)) { - - ret = syncop_lookup (this, &entry_loc, NULL, - &iatt, &xattr_rsp, &parent); - - entry_loc.ino = iatt.ia_ino; - entry_loc.inode->ino = iatt.ia_ino; - memcpy (entry_loc.inode->gfid, iatt.ia_gfid, 16); - - gf_log (this->name, GF_LOG_DEBUG, - "lookup %s => %"PRId64, - entry_loc.path, - iatt.ia_ino); - - ret = syncop_lookup (this, &entry_loc, NULL, - &iatt, &xattr_rsp, &parent); - - - gf_log (this->name, GF_LOG_DEBUG, - "second lookup ret=%d: %s => %"PRId64, - ret, - entry_loc.path, - iatt.ia_ino); - - pump_update_resume_state (this, entry_loc.path); - - pump_save_path (this, entry_loc.path); - pump_save_file_stats (this, entry_loc.path); - - ret = pump_check_and_update_status (this); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Pump beginning to exit out"); - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "type of 
file=%d, IFDIR=%d", - iatt.ia_type, IA_IFDIR); - - if (IA_ISDIR (iatt.ia_type)) { - if (is_pump_traversal_allowed (this, entry_loc.path)) { - gf_log (this->name, GF_LOG_TRACE, - "entering dir=%s", - entry->d_name); - gf_pump_traverse_directory (&entry_loc); - } - } - } - offset = entry->d_off; - loc_wipe (&entry_loc); - } - - gf_dirent_free (&entries); - gf_log (this->name, GF_LOG_TRACE, - "offset incremented to %d", - (int32_t ) offset); - - } - -out: - return 0; - -} - -void -build_root_loc (inode_t *inode, loc_t *loc) -{ - loc->path = "/"; - loc->name = ""; - loc->inode = inode; - loc->ino = 1; - loc->inode->ino = 1; - memset (loc->inode->gfid, 0, 16); - loc->inode->gfid[15] = 1; - -} - -static int -pump_update_resume_path (xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - const char *resume_path = NULL; - - priv = this->private; - pump_priv = priv->pump_private; - - resume_path = pump_get_resume_path (this); - - if (resume_path) { - gf_log (this->name, GF_LOG_DEBUG, - "Found a path to resume from: %s", - resume_path); - - }else { - gf_log (this->name, GF_LOG_DEBUG, - "Did not find a path=> setting to '/'"); - pump_set_resume_path (this, "/"); - } - - pump_change_state (this, PUMP_STATE_RESUME); - - return 0; -} - -static int -pump_complete_migration (xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - dict_t *dict = NULL; - pump_state_t state; - loc_t loc; - int dict_ret = 0; - int ret = -1; - - priv = this->private; - pump_priv = priv->pump_private; - - GF_ASSERT (priv->root_inode); - - build_root_loc (priv->root_inode, &loc); - - dict = dict_new (); - - state = pump_get_state (); - if (state == PUMP_STATE_RUNNING) { - gf_log (this->name, GF_LOG_DEBUG, - "Pump finished pumping"); - - pump_priv->pump_finished = _gf_true; - - dict_ret = dict_set_str (dict, PUMP_SOURCE_COMPLETE, "jargon"); - - ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0); - if (ret < 0) { - gf_log 
(this->name, GF_LOG_DEBUG, - "setxattr failed - while notifying source complete"); - } - dict_ret = dict_set_str (dict, PUMP_SINK_COMPLETE, "jargon"); - - ret = syncop_setxattr (PUMP_SINK_CHILD (this), &loc, dict, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "setxattr failed - while notifying sink complete"); - } - - pump_save_path (this, "/"); - } - - return 0; -} - -static int -pump_set_root_gfid (dict_t *dict) -{ - uuid_t gfid; - int ret = 0; - - memset (gfid, 0, 16); - gfid[15] = 1; - - ret = afr_set_dict_gfid (dict, gfid); - - return ret; -} - -static int -pump_lookup_sink (loc_t *loc) -{ - xlator_t *this = NULL; - struct iatt iatt, parent; - dict_t *xattr_rsp; - dict_t *xattr_req = NULL; - int ret = 0; - - this = THIS; - - xattr_req = dict_new (); - - ret = pump_set_root_gfid (xattr_req); - if (ret) - goto out; - - ret = syncop_lookup (PUMP_SINK_CHILD (this), loc, - xattr_req, &iatt, &xattr_rsp, &parent); - - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "Lookup on sink child failed"); - goto out; - } - -out: - if (xattr_req) - dict_unref (xattr_req); - - return ret; -} - -static int -pump_task (void *data) -{ - xlator_t *this = NULL; - afr_private_t *priv = NULL; - - - loc_t loc; - struct iatt iatt, parent; - dict_t *xattr_rsp = NULL; - dict_t *xattr_req = NULL; - - int ret = -1; - - this = THIS; - priv = this->private; - - GF_ASSERT (priv->root_inode); - - build_root_loc (priv->root_inode, &loc); - xattr_req = dict_new (); - if (!xattr_req) { - gf_log (this->name, GF_LOG_DEBUG, - "Out of memory"); - ret = -1; - goto out; - } - - pump_set_root_gfid (xattr_req); - ret = syncop_lookup (this, &loc, xattr_req, - &iatt, &xattr_rsp, &parent); - - gf_log (this->name, GF_LOG_TRACE, - "lookup: ino=%"PRId64", path=%s", - loc.ino, - loc.path); - - ret = pump_check_and_update_status (this); - if (ret < 0) { - goto out; - } - - pump_update_resume_path (this); - - pump_set_root_gfid (xattr_req); - ret = pump_lookup_sink (&loc); - if (ret) { - 
pump_update_resume_path (this); - goto out; - } - - gf_pump_traverse_directory (&loc); - - pump_complete_migration (this); -out: - if (xattr_req) - dict_unref (xattr_req); - - return 0; -} - - -static int -pump_task_completion (int ret, void *data) -{ - xlator_t *this = NULL; - call_frame_t *frame = NULL; - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - this = THIS; - - frame = (call_frame_t *) data; - - priv = this->private; - pump_priv = priv->pump_private; - - inode_unref (priv->root_inode); - - gf_log (this->name, GF_LOG_DEBUG, - "Pump xlator exiting"); - return 0; -} - -int -pump_start (call_frame_t *pump_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - int ret = -1; - - priv = this->private; - pump_priv = priv->pump_private; - - if (!pump_frame->root->lk_owner) - pump_frame->root->lk_owner = PUMP_LK_OWNER; - - ret = synctask_new (pump_priv->env, pump_task, - pump_task_completion, - pump_frame); - if (ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "starting pump failed"); - pump_change_state (this, PUMP_STATE_ABORT); - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "setting pump as started"); - - priv->use_afr_in_pump = 1; -out: - return ret; -} - -static int -pump_start_synctask (xlator_t *this) -{ - call_frame_t *frame = NULL; - int ret = 0; - - frame = create_frame (this, this->ctx->pool); - if (!frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - ret = -1; - goto out; - } - - pump_change_state (this, PUMP_STATE_RUNNING); - - ret = pump_start (frame, this); - -out: - return ret; -} - -int32_t -pump_cmd_start_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) - -{ - call_frame_t *prev = NULL; - afr_local_t *local = NULL; - int ret = 0; - - local = frame->local; - - if (op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Could not initiate destination " - "brick connect"); - ret = op_ret; - goto out; - } - - 
gf_log (this->name, GF_LOG_DEBUG, - "Successfully initiated destination " - "brick connect"); - - /* send the PARENT_UP as pump is ready now */ - prev = cookie; - if (prev && prev->this) - prev->this->notify (prev->this, GF_EVENT_PARENT_UP, this); - - pump_mark_start_pending (this); - -out: - local->op_ret = ret; - pump_command_reply (frame, this); - - return 0; -} - -static int -pump_initiate_sink_connect (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - dict_t *dict = NULL; - char *dst_brick = NULL; - loc_t loc; - - int ret = 0; - - priv = this->private; - local = frame->local; - - GF_ASSERT (priv->root_inode); - - build_root_loc (priv->root_inode, &loc); - - ret = dict_get_str (local->dict, PUMP_CMD_START, &dst_brick); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Could not get destination brick value"); - goto out; - } - - dict = dict_new (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - ret = -1; - goto out; - } - - GF_ASSERT (dst_brick); - gf_log (this->name, GF_LOG_DEBUG, - "Got destination brick as %s", dst_brick); - - ret = dict_set_str (dict, CLIENT_CMD_CONNECT, dst_brick); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Could not inititiate destination brick " - "connect"); - goto out; - } - - STACK_WIND (frame, - pump_cmd_start_setxattr_cbk, - PUMP_SINK_CHILD(this), - PUMP_SINK_CHILD(this)->fops->setxattr, - &loc, - dict, - 0); - - ret = 0; - - dict_unref (dict); -out: - return ret; -} - -static int -is_pump_aborted (xlator_t *this) -{ - pump_state_t state; - - state = pump_get_state (); - - return ((state == PUMP_STATE_ABORT)); -} - -int32_t -pump_cmd_start_getxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *dict) -{ - afr_local_t *local = NULL; - char *path = NULL; - - pump_state_t state; - int ret = 0; - int need_unwind = 0; - int dict_ret = -1; - - local = frame->local; - - if (op_ret < 0) 
{ - gf_log (this->name, GF_LOG_DEBUG, - "getxattr failed - changing pump " - "state to RUNNING with '/'"); - path = "/"; - ret = op_ret; - } else { - gf_log (this->name, GF_LOG_TRACE, - "getxattr succeeded"); - - dict_ret = dict_get_str (dict, PUMP_PATH, &path); - if (dict_ret < 0) - path = "/"; - } - - state = pump_get_state (); - if ((state == PUMP_STATE_RUNNING) || - (state == PUMP_STATE_RESUME)) { - gf_log (this->name, GF_LOG_ERROR, - "Pump is already started"); - ret = -1; - goto out; - } - - pump_set_resume_path (this, path); - - if (is_pump_aborted (this)) - /* We're re-starting pump afresh */ - ret = pump_initiate_sink_connect (frame, this); - else { - /* We're re-starting pump from a previous - pause */ - gf_log (this->name, GF_LOG_DEBUG, - "about to start synctask"); - ret = pump_start_synctask (this); - need_unwind = 1; - } - -out: - if ((ret < 0) || (need_unwind == 1)) { - local->op_ret = ret; - pump_command_reply (frame, this); - } - return 0; -} - -int -pump_execute_status (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - - uint64_t number_files = 0; - - char filename[PATH_MAX]; - char *dict_str = NULL; - - int32_t op_ret = 0; - int32_t op_errno = 0; - - dict_t *dict = NULL; - int ret = -1; - - priv = this->private; - pump_priv = priv->pump_private; - - LOCK (&pump_priv->resume_path_lock); - { - number_files = pump_priv->number_files_pumped; - strncpy (filename, pump_priv->current_file, PATH_MAX); - } - UNLOCK (&pump_priv->resume_path_lock); - - dict_str = GF_CALLOC (1, PATH_MAX + 256, gf_afr_mt_char); - if (!dict_str) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - if (pump_priv->pump_finished) { - snprintf (dict_str, PATH_MAX + 256, "Number of files migrated = %"PRIu64" Migration complete ", - number_files); - } else { - snprintf (dict_str, PATH_MAX + 256, "Number of files migrated = %"PRIu64" Current file= %s ", - 
number_files, filename); - } - - dict = dict_new (); - - ret = dict_set_str (dict, PUMP_CMD_STATUS, dict_str); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "dict_set_str returned negative value"); - } - - op_ret = 0; - -out: - - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); - - dict_unref (dict); - GF_FREE (dict_str); - - return 0; -} - -int -pump_execute_pause (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - pump_change_state (this, PUMP_STATE_PAUSE); - - local->op_ret = 0; - pump_command_reply (frame, this); - - return 0; -} - -int -pump_execute_start (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int ret = 0; - loc_t loc; - - priv = this->private; - local = frame->local; - - if (!priv->root_inode) { - gf_log (this->name, GF_LOG_ERROR, - "Pump xlator cannot be started without an initial " - "lookup"); - ret = -1; - goto out; - } - - GF_ASSERT (priv->root_inode); - - build_root_loc (priv->root_inode, &loc); - - STACK_WIND (frame, - pump_cmd_start_getxattr_cbk, - PUMP_SOURCE_CHILD(this), - PUMP_SOURCE_CHILD(this)->fops->getxattr, - &loc, - PUMP_PATH); - - ret = 0; - -out: - if (ret < 0) { - local->op_ret = ret; - pump_command_reply (frame, this); - } - - return 0; -} - -int -pump_execute_abort (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - afr_local_t *local = NULL; - - priv = this->private; - pump_priv = priv->pump_private; - local = frame->local; - - pump_change_state (this, PUMP_STATE_ABORT); - - LOCK (&pump_priv->resume_path_lock); - { - pump_priv->number_files_pumped = 0; - pump_priv->current_file[0] = '\0'; - } - UNLOCK (&pump_priv->resume_path_lock); - - local->op_ret = 0; - pump_command_reply (frame, this); - - return 0; -} - -gf_boolean_t -pump_command_status (xlator_t *this, dict_t *dict) -{ - char *cmd = NULL; - int dict_ret = -1; - int ret = _gf_true; - - 
dict_ret = dict_get_str (dict, PUMP_CMD_STATUS, &cmd); - if (dict_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Not a pump status command"); - ret = _gf_false; - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Hit a pump command - status"); - ret = _gf_true; - -out: - return ret; - -} - -gf_boolean_t -pump_command_pause (xlator_t *this, dict_t *dict) -{ - char *cmd = NULL; - int dict_ret = -1; - int ret = _gf_true; - - dict_ret = dict_get_str (dict, PUMP_CMD_PAUSE, &cmd); - if (dict_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Not a pump pause command"); - ret = _gf_false; - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Hit a pump command - pause"); - ret = _gf_true; - -out: - return ret; - -} - -gf_boolean_t -pump_command_abort (xlator_t *this, dict_t *dict) -{ - char *cmd = NULL; - int dict_ret = -1; - int ret = _gf_true; - - dict_ret = dict_get_str (dict, PUMP_CMD_ABORT, &cmd); - if (dict_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Not a pump abort command"); - ret = _gf_false; - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Hit a pump command - abort"); - ret = _gf_true; - -out: - return ret; - -} - -gf_boolean_t -pump_command_start (xlator_t *this, dict_t *dict) -{ - char *cmd = NULL; - int dict_ret = -1; - int ret = _gf_true; - - dict_ret = dict_get_str (dict, PUMP_CMD_START, &cmd); - if (dict_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Not a pump start command"); - ret = _gf_false; - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Hit a pump command - start"); - ret = _gf_true; - -out: - return ret; - -} - -struct _xattr_key { - char *key; - struct list_head list; -}; - -static void -__gather_xattr_keys (dict_t *dict, char *key, data_t *value, - void *data) -{ - struct list_head * list = data; - struct _xattr_key * xkey = NULL; - - if (!strncmp (key, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { - - xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); - if (!xkey) - return; - - xkey->key = key; - 
INIT_LIST_HEAD (&xkey->list); - - list_add_tail (&xkey->list, list); - } -} - -static void -__filter_xattrs (dict_t *dict) -{ - struct list_head keys; - - struct _xattr_key *key; - struct _xattr_key *tmp; - - INIT_LIST_HEAD (&keys); - - dict_foreach (dict, __gather_xattr_keys, - (void *) &keys); - - list_for_each_entry_safe (key, tmp, &keys, list) { - dict_del (dict, key->key); - - list_del_init (&key->list); - - GF_FREE (key); - } -} - -int32_t -pump_getxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; - - priv = this->private; - children = priv->children; - - local = frame->local; - - read_child = (long) cookie; - - if (op_ret == -1) { - retry: - last_tried = local->cont.getxattr.last_tried; - - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.getxattr.last_tried; - - if (this_try == read_child) { - goto retry; - } - - unwind = 0; - STACK_WIND_COOKIE (frame, pump_getxattr_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->getxattr, - &local->loc, - local->cont.getxattr.name); - } - -out: - if (unwind) { - if (op_ret >= 0 && dict) - __filter_xattrs (dict); - - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); - } - - return 0; -} - -int32_t -pump_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) -{ - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t * local = NULL; - - int read_child = -1; - - int32_t op_ret = -1; - int32_t op_errno = 0; - - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - ALLOC_OR_GOTO 
(local, afr_local_t, out); - frame->local = local; - - if (name) { - if (!strncmp (name, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { - - op_errno = ENODATA; - goto out; - } - - if (!strcmp (name, PUMP_CMD_STATUS)) { - gf_log (this->name, GF_LOG_DEBUG, - "Hit pump command - status"); - pump_execute_status (frame, this); - op_ret = 0; - goto out; - } - } - - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_getxattr_cbk, - FIRST_CHILD (this), - (FIRST_CHILD (this))->fops->getxattr, - loc, name); - return 0; - } - - read_child = afr_read_child (this, loc->inode); - - if (read_child >= 0) { - call_child = read_child; - - local->cont.getxattr.last_tried = -1; - } else { - call_child = afr_first_up_child (priv); - - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } - - local->cont.getxattr.last_tried = call_child; - } - - loc_copy (&local->loc, loc); - if (name) - local->cont.getxattr.name = gf_strdup (name); - - STACK_WIND_COOKIE (frame, pump_getxattr_cbk, - (void *) (long) call_child, - children[call_child], children[call_child]->fops->getxattr, - loc, name); - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, NULL); - } - return 0; -} - -static int -afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (setxattr, main_frame, - local->op_ret, local->op_errno) - } - return 0; -} - -static int -afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int 
call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->child_count) { - need_unwind = 1; - } - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; -} - -static int -afr_setxattr_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, - local->cont.setxattr.dict, - local->cont.setxattr.flags); - - if (!--call_count) - break; - } - } - - return 0; -} - - -static int -afr_setxattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - -int32_t -pump_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, - op_ret, - op_errno); - return 0; -} - -int -pump_command_reply (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - if (local->op_ret < 0) - gf_log (this->name, GF_LOG_NORMAL, - "Command failed"); - else - gf_log (this->name, GF_LOG_NORMAL, - "Command succeeded"); - 
- dict_unref (local->dict); - - AFR_STACK_UNWIND (setxattr, - frame, - local->op_ret, - local->op_errno); - - return 0; -} - -int -pump_parse_command (call_frame_t *frame, xlator_t *this, - afr_local_t *local, dict_t *dict) -{ - - int ret = -1; - - if (pump_command_start (this, dict)) { - frame->local = local; - local->dict = dict_ref (dict); - ret = pump_execute_start (frame, this); - - } else if (pump_command_pause (this, dict)) { - frame->local = local; - local->dict = dict_ref (dict); - ret = pump_execute_pause (frame, this); - - } else if (pump_command_abort (this, dict)) { - frame->local = local; - local->dict = dict_ref (dict); - ret = pump_execute_abort (frame, this); - } - return ret; -} - -int -pump_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - - int ret = -1; - - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - ret = pump_parse_command (frame, this, - local, dict); - if (ret >= 0) { - op_ret = 0; - goto out; - } - - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_setxattr_cbk, - FIRST_CHILD (this), - (FIRST_CHILD (this))->fops->setxattr, - loc, dict, flags); - return 0; - } - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - transaction_frame->local = local; - - local->op_ret = -1; - - local->cont.setxattr.dict = dict_ref (dict); - local->cont.setxattr.flags = flags; - - local->transaction.fop = afr_setxattr_wind; - local->transaction.done = afr_setxattr_done; - local->transaction.unwind = afr_setxattr_unwind; - - loc_copy (&local->loc, 
loc); - - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; - - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno); - } - - return 0; -} - -/* Defaults */ -static int32_t -pump_lookup (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *xattr_req) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_lookup_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, - loc, - xattr_req); - return 0; - } - - afr_lookup (frame, this, loc, xattr_req); - return 0; -} - - -static int32_t -pump_truncate (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - off_t offset) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_truncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, - loc, - offset); - return 0; - } - - afr_truncate (frame, this, loc, offset); - return 0; -} - - -static int32_t -pump_ftruncate (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - off_t offset) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_ftruncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, - fd, - offset); - return 0; - } - - afr_ftruncate (frame, this, fd, offset); - return 0; -} - - - - -int -pump_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *parms) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_mknod_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mknod, - loc, mode, rdev, parms); - return 0; - } - afr_mknod (frame, this, loc, mode, rdev, parms); - return 0; - -} - - - -int 
-pump_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_mkdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mkdir, - loc, mode, params); - return 0; - } - afr_mkdir (frame, this, loc, mode, params); - return 0; - -} - - -static int32_t -pump_unlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_unlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, - loc); - return 0; - } - afr_unlink (frame, this, loc); - return 0; - -} - - -static int -pump_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags) -{ - afr_private_t *priv = NULL; - - priv = this->private; - - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_rmdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rmdir, - loc, flags); - return 0; - } - - afr_rmdir (frame, this, loc, flags); - return 0; - -} - - - -int -pump_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, dict_t *params) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_symlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->symlink, - linkpath, loc, params); - return 0; - } - afr_symlink (frame, this, linkpath, loc, params); - return 0; - -} - - -static int32_t -pump_rename (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_rename_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rename, - oldloc, newloc); - return 0; - } - afr_rename (frame, this, oldloc, newloc); - return 0; - -} - - -static int32_t -pump_link (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t 
*newloc) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_link_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->link, - oldloc, newloc); - return 0; - } - afr_link (frame, this, oldloc, newloc); - return 0; - -} - - -static int32_t -pump_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_create_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, - loc, flags, mode, fd, params); - return 0; - } - afr_create (frame, this, loc, flags, mode, fd, params); - return 0; - -} - - -static int32_t -pump_open (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, fd_t *fd, - int32_t wbflags) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_open_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, - loc, flags, fd, wbflags); - return 0; - } - afr_open (frame, this, loc, flags, fd, wbflags); - return 0; - -} - - -static int32_t -pump_writev (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iovec *vector, - int32_t count, - off_t off, - struct iobref *iobref) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_writev_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, - fd, - vector, - count, - off, - iobref); - return 0; - } - afr_writev (frame, this, fd, vector, count, off, iobref); - return 0; - -} - - -static int32_t -pump_flush (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_flush_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, - fd); - return 0; - } - afr_flush (frame, this, fd); - return 0; - -} - - 
-static int32_t -pump_fsync (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_fsync_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsync, - fd, - flags); - return 0; - } - afr_fsync (frame, this, fd, flags); - return 0; - -} - - -static int32_t -pump_opendir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, fd_t *fd) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_opendir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->opendir, - loc, fd); - return 0; - } - afr_opendir (frame, this, loc, fd); - return 0; - -} - - -static int32_t -pump_fsyncdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_fsyncdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsyncdir, - fd, - flags); - return 0; - } - afr_fsyncdir (frame, this, fd, flags); - return 0; - -} - - -static int32_t -pump_xattrop (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - gf_xattrop_flags_t flags, - dict_t *dict) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_xattrop_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->xattrop, - loc, - flags, - dict); - return 0; - } - afr_xattrop (frame, this, loc, flags, dict); - return 0; - -} - -static int32_t -pump_fxattrop (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - gf_xattrop_flags_t flags, - dict_t *dict) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_fxattrop_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fxattrop, - fd, - flags, - dict); - return 0; - } - afr_fxattrop (frame, this, fd, flags, dict); - return 0; - -} - - -static int32_t 
-pump_removexattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_removexattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->removexattr, - loc, - name); - return 0; - } - afr_removexattr (frame, this, loc, name); - return 0; - -} - - - -static int32_t -pump_readdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t off) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_readdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdir, - fd, size, off); - return 0; - } - afr_readdir (frame, this, fd, size, off); - return 0; - -} - - -static int32_t -pump_readdirp (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t off) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_readdirp_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdirp, - fd, size, off); - return 0; - } - afr_readdirp (frame, this, fd, size, off); - return 0; - -} - - - -static int32_t -pump_releasedir (xlator_t *this, - fd_t *fd) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (priv->use_afr_in_pump) - afr_releasedir (this, fd); - return 0; - -} - -static int32_t -pump_release (xlator_t *this, - fd_t *fd) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (priv->use_afr_in_pump) - afr_release (this, fd); - return 0; - -} - - -static int32_t -pump_setattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - struct iatt *stbuf, - int32_t valid) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_setattr_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->setattr, - loc, stbuf, valid); - return 0; - } - afr_setattr (frame, this, loc, stbuf, valid); - return 0; - -} 
- - -static int32_t -pump_fsetattr (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iatt *stbuf, - int32_t valid) -{ - afr_private_t *priv = NULL; - priv = this->private; - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, - default_fsetattr_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsetattr, - fd, stbuf, valid); - return 0; - } - afr_fsetattr (frame, this, fd, stbuf, valid); - return 0; - -} - - -/* End of defaults */ - - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1); - - if (ret != 0) { - gf_log(this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; -} - -static int -is_xlator_pump_sink (xlator_t *child) -{ - return (child == PUMP_SINK_CHILD(THIS)); -} - -static int -is_xlator_pump_source (xlator_t *child) -{ - return (child == PUMP_SOURCE_CHILD(THIS)); -} - -int32_t -notify (xlator_t *this, int32_t event, - void *data, ...) 
-{ - int ret = -1; - xlator_t *child_xl = NULL; - - child_xl = (xlator_t *) data; - - ret = afr_notify (this, event, data); - - switch (event) { - case GF_EVENT_CHILD_DOWN: - if (is_xlator_pump_source (child_xl)) - pump_change_state (this, PUMP_STATE_ABORT); - break; - - case GF_EVENT_CHILD_UP: - if (is_xlator_pump_sink (child_xl)) - if (is_pump_start_pending (this)) { - gf_log (this->name, GF_LOG_DEBUG, - "about to start synctask"); - ret = pump_start_synctask (this); - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "Could not start pump " - "synctask"); - else - pump_remove_start_pending (this); - } - } - - return ret; -} - -int32_t -init (xlator_t *this) -{ - afr_private_t * priv = NULL; - pump_private_t *pump_priv = NULL; - int child_count = 0; - xlator_list_t * trav = NULL; - int i = 0; - int ret = -1; - int op_errno = 0; - - int source_child = 0; - - if (!this->children) { - gf_log (this->name, GF_LOG_ERROR, - "pump translator needs a source and sink" - "subvolumes defined."); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "Volume is dangling."); - } - - ALLOC_OR_GOTO (this->private, afr_private_t, out); - - priv = this->private; - - priv->read_child = source_child; - priv->favorite_child = source_child; - priv->background_self_heal_count = 0; - - priv->data_self_heal = 1; - priv->metadata_self_heal = 1; - priv->entry_self_heal = 1; - - priv->data_self_heal_algorithm = ""; - - priv->data_self_heal_window_size = 16; - - priv->data_change_log = 1; - priv->metadata_change_log = 1; - priv->entry_change_log = 1; - priv->use_afr_in_pump = 1; - - /* Locking options */ - - /* Lock server count infact does not matter. Locks are held - on all subvolumes, in this case being the source - and the sink. 
- */ - - priv->data_lock_server_count = 2; - priv->metadata_lock_server_count = 2; - priv->entry_lock_server_count = 2; - - priv->strict_readdir = _gf_false; - - trav = this->children; - while (trav) { - child_count++; - trav = trav->next; - } - - priv->wait_count = 1; - - if (child_count != 2) { - gf_log (this->name, GF_LOG_ERROR, - "There should be exactly 2 children - one source " - "and one sink"); - return -1; - } - priv->child_count = child_count; - - LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->read_child_lock); - - priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, - gf_afr_mt_char); - if (!priv->child_up) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - op_errno = ENOMEM; - goto out; - } - - priv->children = GF_CALLOC (sizeof (xlator_t *), child_count, - gf_afr_mt_xlator_t); - if (!priv->children) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - op_errno = ENOMEM; - goto out; - } - - priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key), - child_count, - gf_afr_mt_char); - if (!priv->pending_key) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - op_errno = ENOMEM; - goto out; - } - - trav = this->children; - i = 0; - while (i < child_count) { - priv->children[i] = trav->xlator; - - ret = asprintf (&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX, - trav->xlator->name); - if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed to set pending key"); - op_errno = ENOMEM; - goto out; - } - - trav = trav->next; - i++; - } - - priv->first_lookup = 1; - priv->root_inode = NULL; - - pump_priv = GF_CALLOC (1, sizeof (*pump_priv), - gf_afr_mt_pump_priv); - if (!pump_priv) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - op_errno = ENOMEM; - goto out; - } - - LOCK_INIT (&pump_priv->resume_path_lock); - LOCK_INIT (&pump_priv->pump_state_lock); - - pump_priv->resume_path = GF_CALLOC (1, PATH_MAX, - gf_afr_mt_char); - if (!pump_priv->resume_path) { - gf_log (this->name, GF_LOG_ERROR, - "Out 
of memory"); - ret = -1; - goto out; - } - - pump_priv->env = syncenv_new (0); - if (!pump_priv->env) { - gf_log (this->name, GF_LOG_ERROR, - "Could not create new sync-environment"); - ret = -1; - goto out; - } - - priv->pump_private = pump_priv; - - pthread_mutex_init (&priv->mutex, NULL); - INIT_LIST_HEAD (&priv->saved_fds); - - pump_change_state (this, PUMP_STATE_ABORT); - - ret = 0; -out: - return ret; -} - -int -fini (xlator_t *this) -{ - return 0; -} - - -struct xlator_fops fops = { - .lookup = pump_lookup, - .open = pump_open, - .flush = pump_flush, - .fsync = pump_fsync, - .fsyncdir = pump_fsyncdir, - .xattrop = pump_xattrop, - .fxattrop = pump_fxattrop, - .getxattr = pump_getxattr, - - /* inode write */ - .writev = pump_writev, - .truncate = pump_truncate, - .ftruncate = pump_ftruncate, - .setxattr = pump_setxattr, - .setattr = pump_setattr, - .fsetattr = pump_fsetattr, - .removexattr = pump_removexattr, - - /* dir read */ - .opendir = pump_opendir, - .readdir = pump_readdir, - .readdirp = pump_readdirp, - - /* dir write */ - .create = pump_create, - .mknod = pump_mknod, - .mkdir = pump_mkdir, - .unlink = pump_unlink, - .rmdir = pump_rmdir, - .link = pump_link, - .symlink = pump_symlink, - .rename = pump_rename, -}; - -struct xlator_dumpops dumpops = { - .priv = afr_priv_dump, -}; - - -struct xlator_cbks cbks = { - .release = pump_release, - .releasedir = pump_releasedir, -}; - -struct volume_options options[] = { - { .key = {NULL} }, -}; diff --git a/xlators/cluster/afr/src/pump.h b/xlators/cluster/afr/src/pump.h deleted file mode 100644 index a46f9d7a542..00000000000 --- a/xlators/cluster/afr/src/pump.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#ifndef __PUMP_H__ -#define __PUMP_H__ - -#include "syncop.h" - -/* FIXME: Needs to be defined in a common file */ -#define CLIENT_CMD_CONNECT "trusted.glusterfs.client-connect" -#define CLIENT_CMD_DISCONNECT "trusted.glusterfs.client-disconnect" - -#define PUMP_PID 696969 -#define PUMP_LK_OWNER 696969 - -#define IS_ROOT_PATH(path) (!strcmp (path, "/")) -#define IS_ENTRY_CWD(entry) (!strcmp (entry, ".")) -#define IS_ENTRY_PARENT(entry) (!strcmp (entry, "..")) - -#define PUMP_CMD_START "trusted.glusterfs.pump.start" -#define PUMP_CMD_ABORT "trusted.glusterfs.pump.abort" -#define PUMP_CMD_PAUSE "trusted.glusterfs.pump.pause" -#define PUMP_CMD_STATUS "trusted.glusterfs.pump.status" - -#define PUMP_SOURCE_COMPLETE "trusted.glusterfs.pump-source-complete" -#define PUMP_SINK_COMPLETE "trusted.glusterfs.pump-sink-complete" - -#define PUMP_PATH "trusted.glusterfs.pump-path" - -#define PUMP_SOURCE_CHILD(xl) (xl->children->xlator) -#define PUMP_SINK_CHILD(xl) (xl->children->next->xlator) - -typedef enum { - PUMP_STATE_RUNNING, /* Pump is running and migrating files */ - PUMP_STATE_RESUME, /* Pump is resuming from a previous pause */ - PUMP_STATE_PAUSE, /* Pump is paused */ - PUMP_STATE_ABORT, /* Pump is aborted */ -} pump_state_t; - -typedef struct _pump_private { - struct syncenv *env; /* The env pointer to the pump synctask */ 
- const char *resume_path; /* path to resume from the last pause */ - gf_lock_t resume_path_lock; /* Synchronize resume_path changes */ - gf_lock_t pump_state_lock; /* Synchronize pump_state changes */ - pump_state_t pump_state; /* State of pump */ - char current_file[PATH_MAX]; /* Current file being pumped */ - uint64_t number_files_pumped; /* Number of files pumped */ - gf_boolean_t pump_finished; /* Boolean to indicate pump termination */ - char pump_start_pending; /* Boolean to mark start pending until - CHILD_UP */ -} pump_private_t; - -void -build_root_loc (inode_t *inode, loc_t *loc); -int pump_start (call_frame_t *frame, xlator_t *this); - -gf_boolean_t -pump_command_start (xlator_t *this, dict_t *dict); - -int -pump_execute_start (call_frame_t *frame, xlator_t *this); - -gf_boolean_t -pump_command_pause (xlator_t *this, dict_t *dict); - -int -pump_execute_pause (call_frame_t *frame, xlator_t *this); - -gf_boolean_t -pump_command_abort (xlator_t *this, dict_t *dict); - -int -pump_execute_abort (call_frame_t *frame, xlator_t *this); - -gf_boolean_t -pump_command_status (xlator_t *this, dict_t *dict); - -int -pump_execute_status (call_frame_t *frame, xlator_t *this); - -#endif /* __PUMP_H__ */ diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am index 8ebcab04451..56f1f2ad7c8 100644 --- a/xlators/cluster/dht/src/Makefile.am +++ b/xlators/cluster/dht/src/Makefile.am @@ -1,36 +1,48 @@ - xlator_LTLIBRARIES = dht.la nufa.la switch.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster +AM_CFLAGS = -Wall $(GF_CFLAGS) -dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c \ - dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \ - $(top_builddir)/xlators/lib/src/libxlator.c +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster -dht_la_SOURCES = $(dht_common_source) dht.c +dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \ + dht-selfheal.c dht-rename.c dht-hashfn.c 
dht-diskusage.c \ + dht-common.c dht-inode-write.c dht-inode-read.c dht-shared.c \ + dht-lock.c $(top_builddir)/xlators/lib/src/libxlator.c + +dht_la_SOURCES = $(dht_common_source) dht.c nufa_la_SOURCES = $(dht_common_source) nufa.c switch_la_SOURCES = $(dht_common_source) switch.c -dht_la_LDFLAGS = -module -avoidversion +dht_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -nufa_la_LDFLAGS = -module -avoidversion +nufa_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -switch_la_LDFLAGS = -module -avoidversion +switch_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = dht-common.h dht-common.c dht-mem-types.h $(top_builddir)/xlators/lib/src/libxlator.h +noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h \ + dht-lock.h $(top_builddir)/xlators/lib/src/libxlator.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) \ - -I$(top_srcdir)/xlators/lib/src +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(top_srcdir)/xlators/lib/src \ + -DDATADIR=\"$(localstatedir)\" \ + -DLIBDIR=\"$(libdir)\" -CLEANFILES = +CLEANFILES = uninstall-local: rm -f $(DESTDIR)$(xlatordir)/distribute.so install-data-hook: ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so + +if UNITTEST +CLEANFILES += *.gcda *.gcno *_xunit.xml +noinst_PROGRAMS = +TESTS = +endif diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 5042ad06555..8ba0cc4c732 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -1,79 +1,446 @@ /* - Copyright (c) 2009-2010 Gluster, Inc. 
<http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - /* TODO: add NS locking */ -#include "glusterfs.h" -#include "xlator.h" #include "libxlator.h" #include "dht-common.h" -#include "defaults.h" -#include "byte-order.h" +#include "dht-lock.h" +#include <glusterfs/byte-order.h> +#include <glusterfs/quota-common-utils.h> +#include <glusterfs/upcall-utils.h> +#include "glusterfs/compat-errno.h" // for ENODATA on BSD +#include <glusterfs/common-utils.h> #include <sys/time.h> #include <libgen.h> +#include <signal.h> +static int +dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata); -void -dht_aggregate (dict_t *this, char *key, data_t *value, void *data) +static int +dht_link2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret); + +static int +dht_set_dir_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req); + +static int +dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this); + +static int +dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata); + +static int +dht_rmdir_unlock(call_frame_t *frame, xlator_t *this); + +static const char *dht_dbg_vxattrs[] = {DHT_DBG_HASHED_SUBVOL_PATTERN, NULL}; + +/* Check the xdata to make sure EBADF has been set by client xlator */ +int32_t +dht_check_remote_fd_failed_error(dht_local_t *local, int op_ret, int op_errno) { - dict_t *dst = NULL; - int64_t *ptr = 0, size = 0; - int32_t ret = -1; + if (op_ret == -1 && (op_errno == EBADF || op_errno == EBADFD) && + !(local->fd_checked)) { + return 1; + } + return 0; +} - dst = data; +/* Sets the blocks and size values to fixed values. This is to be called + * only for dirs. 
The caller is responsible for checking the type + */ +int32_t +dht_set_fixed_dir_stat(struct iatt *stat) +{ + if (stat) { + stat->ia_blocks = DHT_DIR_STAT_BLOCKS; + stat->ia_size = DHT_DIR_STAT_SIZE; + return 0; + } + return -1; +} - if (strncmp (key, GF_XATTR_QUOTA_SIZE_KEY, - strlen (GF_XATTR_QUOTA_SIZE_KEY)) == 0) { - ret = dict_get_bin (dst, key, (void **)&ptr); - if (ret == 0) { - size = ntoh64 (*ptr); - } +/* Return true if key exists in array + */ +static gf_boolean_t +dht_match_xattr(const char *key) +{ + char **xattrs_to_heal = get_xattrs_to_heal(); + + return gf_get_index_by_elem(xattrs_to_heal, (char *)key) >= 0; +} + +static int +dht_aggregate_quota_xattr(dict_t *dst, char *key, data_t *value) +{ + int ret = -1; + quota_meta_t *meta_dst = NULL; + quota_meta_t *meta_src = NULL; + int64_t *size = NULL; + int64_t dst_dir_count = 0; + int64_t src_dir_count = 0; + + if (value == NULL) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DATA_NULL, + "data value is NULL"); + ret = -1; + goto out; + } + + ret = dict_get_bin(dst, key, (void **)&meta_dst); + if (ret < 0) { + meta_dst = GF_CALLOC(1, sizeof(quota_meta_t), gf_common_quota_meta_t); + if (meta_dst == NULL) { + gf_msg("dht", GF_LOG_WARNING, ENOMEM, DHT_MSG_NO_MEMORY, + "Memory allocation failed"); + ret = -1; + goto out; + } + ret = dict_set_bin(dst, key, meta_dst, sizeof(quota_meta_t)); + if (ret < 0) { + gf_msg("dht", GF_LOG_WARNING, EINVAL, DHT_MSG_DICT_SET_FAILED, + "dht aggregate dict set failed"); + GF_FREE(meta_dst); + ret = -1; + goto out; + } + } + + if (value->len > sizeof(int64_t)) { + meta_src = data_to_bin(value); - ptr = data_to_bin (value); + meta_dst->size = hton64(ntoh64(meta_dst->size) + + ntoh64(meta_src->size)); + meta_dst->file_count = hton64(ntoh64(meta_dst->file_count) + + ntoh64(meta_src->file_count)); - size += ntoh64 (*ptr); + if (value->len > (2 * sizeof(int64_t))) { + dst_dir_count = ntoh64(meta_dst->dir_count); + src_dir_count = ntoh64(meta_src->dir_count); - *ptr = hton64 (*ptr); 
- ret = dict_set_bin (dst, key, ptr, sizeof (int64_t)); + if (src_dir_count > dst_dir_count) + meta_dst->dir_count = meta_src->dir_count; + } else { + meta_dst->dir_count = 0; } + } else { + size = data_to_bin(value); + meta_dst->size = hton64(ntoh64(meta_dst->size) + ntoh64(*size)); + } - return; + ret = 0; +out: + return ret; } +static int +add_opt(char **optsp, const char *opt) +{ + char *newopts = NULL; + unsigned oldsize = 0; + unsigned newsize = 0; + + if (*optsp == NULL) + newopts = gf_strdup(opt); + else { + oldsize = strlen(*optsp); + newsize = oldsize + 1 + strlen(opt) + 1; + newopts = GF_REALLOC(*optsp, newsize); + if (newopts) + sprintf(newopts + oldsize, ",%s", opt); + } + if (newopts == NULL) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to add choices in buffer in add_opt"); + return -1; + } + *optsp = newopts; + return 0; +} -void -dht_aggregate_xattr (dict_t *dst, dict_t *src) +/* Return Choice list from Split brain status */ +static char * +getChoices(const char *value) { - if ((dst == NULL) || (src == NULL)) { + int i = 0; + char *ptr = NULL; + char *tok = NULL; + char *result = NULL; + char *newval = NULL; + + ptr = strstr(value, "Choices:"); + if (!ptr) { + result = ptr; + goto out; + } + + newval = gf_strdup(ptr); + if (!newval) { + result = newval; + goto out; + } + + tok = strtok(newval, ":"); + if (!tok) { + result = tok; + goto out; + } + + while (tok) { + i++; + if (i == 2) + break; + tok = strtok(NULL, ":"); + } + + result = gf_strdup(tok); + +out: + if (newval) + GF_FREE(newval); + + return result; +} + +/* This function prepare a list of choices for key + (replica.split-brain-status) in case of metadata split brain + only on the basis of key-value passed to this function. + After prepare the list of choices it update the same key in dict + with this value to reflect the same in + replica.split-brain-status attr for file. 
+ +*/ + +static int +dht_aggregate_split_brain_xattr(dict_t *dst, char *key, data_t *value) +{ + int ret = 0; + char *oldvalue = NULL; + char *old_choice = NULL; + char *new_choice = NULL; + char *full_choice = NULL; + char *status = NULL; + + if (value == NULL) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DATA_NULL, + "GF_AFR_SBRAIN_STATUS value is NULL"); + ret = -1; + goto out; + } + + ret = dict_get_str(dst, key, &oldvalue); + if (ret) + goto out; + + /* skip code that is irrelevant if !oldvalue */ + if (!oldvalue) + goto out; + + if (strstr(oldvalue, "not")) { + gf_msg_debug("dht", 0, "Need to update split-brain status in dict"); + ret = -1; + goto out; + } + if (strstr(oldvalue, "metadata-split-brain:yes") && + (strstr(oldvalue, "data-split-brain:no"))) { + if (strstr(value->data, "not")) { + gf_msg_debug("dht", 0, "No need to update split-brain status"); + ret = 0; + goto out; + } + if (strstr(value->data, "yes") && + (strncmp(oldvalue, value->data, strlen(oldvalue)))) { + old_choice = getChoices(oldvalue); + if (!old_choice) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to get choices"); + ret = -1; + goto out; + } + + ret = add_opt(&full_choice, old_choice); + if (ret) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to add choices"); + ret = -1; + goto out; + } + + new_choice = getChoices(value->data); + if (!new_choice) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to get choices"); + ret = -1; + goto out; + } + + ret = add_opt(&full_choice, new_choice); + if (ret) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to add choices "); + ret = -1; + goto out; + } + ret = gf_asprintf(&status, + "data-split-brain:%s " + "metadata-split-brain:%s Choices:%s", + "no", "yes", full_choice); + + if (-1 == ret) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to prepare status "); goto out; + } + ret = dict_set_dynstr(dst, key, status); + if (ret) { + gf_msg("dht", GF_LOG_WARNING, 
0, DHT_MSG_DICT_SET_FAILED, + "Failed to set full choice"); + } } + } - dict_foreach (src, dht_aggregate, dst); out: - return; + if (old_choice) + GF_FREE(old_choice); + if (new_choice) + GF_FREE(new_choice); + if (full_choice) + GF_FREE(full_choice); + + return ret; +} + +static int +dht_aggregate(dict_t *this, char *key, data_t *value, void *data) +{ + dict_t *dst = NULL; + int32_t ret = -1; + data_t *dict_data = NULL; + + dst = data; + + /* compare split brain xattr only */ + if (strcmp(key, GF_AFR_SBRAIN_STATUS) == 0) { + ret = dht_aggregate_split_brain_xattr(dst, key, value); + if (!ret) + goto out; + } else if (strcmp(key, QUOTA_SIZE_KEY) == 0) { + ret = dht_aggregate_quota_xattr(dst, key, value); + if (ret) { + gf_msg("dht", GF_LOG_WARNING, 0, + DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED, + "Failed to aggregate quota xattr"); + } + goto out; + } else if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) { + ret = gf_get_min_stime(THIS, dst, key, value); + goto out; + } else { + /* compare user xattrs only */ + if (!strncmp(key, "user.", SLEN("user."))) { + ret = dict_lookup(dst, key, &dict_data); + if (!ret && dict_data && value) { + ret = is_data_equal(dict_data, value); + if (!ret) + gf_msg_debug("dht", 0, "xattr mismatch for %s", key); + } + } + } + + ret = dict_set(dst, key, value); + if (ret) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value: key = %s", key); + } + +out: + return ret; +} + +static void +dht_aggregate_xattr(dict_t *dst, dict_t *src) +{ + if ((dst == NULL) || (src == NULL)) { + goto out; + } + + dict_foreach(src, dht_aggregate, dst); +out: + return; +} + +/* Code to save hashed subvol on inode ctx as a mds subvol + */ +int +dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + uint64_t ctx_int = 0; + gf_boolean_t ctx_free = _gf_false; + + LOCK(&inode->lock); + { + ret = __inode_ctx_get(inode, this, &ctx_int); + if 
(ctx_int) { + ctx = (dht_inode_ctx_t *)(uintptr_t)ctx_int; + ctx->mds_subvol = mds_subvol; + } else { + ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + goto unlock; + ctx->mds_subvol = mds_subvol; + ctx_free = _gf_true; + ctx_int = (long)ctx; + ret = __inode_ctx_set(inode, this, &ctx_int); + } + } +unlock: + UNLOCK(&inode->lock); + if (ret && ctx_free) + GF_FREE(ctx); + return ret; +} + +/*Code to get mds subvol from inode ctx */ + +int +dht_inode_ctx_mdsvol_get(inode_t *inode, xlator_t *this, xlator_t **mdsvol) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + + if (!mdsvol) + return ret; + + if (__is_root_gfid(inode->gfid)) { + (*mdsvol) = FIRST_CHILD(this); + return 0; + } + + ret = dht_inode_ctx_get(inode, this, &ctx); + + if (!ret && ctx) { + if (ctx->mds_subvol) { + *mdsvol = ctx->mds_subvol; + ret = 0; + } else { + ret = -1; + } + } + + return ret; } /* TODO: @@ -83,5123 +450,10942 @@ out: - complete linkfile selfheal */ - -int -dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int op_ret, int op_errno) +static int +dht_lookup_selfheal_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int ret = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + int ret = -1; - GF_VALIDATE_OR_GOTO ("dht", frame, out); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); - local = frame->local; - ret = op_ret; + local = frame->local; + conf = this->private; + ret = op_ret; - FRAME_SU_UNDO (frame, dht_local_t); + FRAME_SU_UNDO(frame, dht_local_t); - if (ret == 0) { - layout = local->selfheal.layout; - ret = dht_layout_set (this, local->inode, layout); + if (ret == 0) { + layout = local->selfheal.layout; + 
ret = dht_layout_set(this, local->inode, layout); + } - if (local->ia_ino) { - local->stbuf.ia_ino = local->ia_ino; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "could not find hashed subvolume for %s", - local->loc.path); - } + dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1); + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, &local->postparent, + 1); + } + + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + /* Delete mds xattr at the time of STACK UNWIND */ + GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr); + + DHT_STACK_UNWIND(lookup, frame, ret, local->op_errno, local->inode, + &local->stbuf, local->xattr, &local->postparent); + +out: + return ret; +} - if (local->loc.parent) - local->postparent.ia_ino = local->loc.parent->ino; +static int +dht_discover_complete(xlator_t *this, call_frame_t *discover_frame) +{ + dht_local_t *local = NULL; + dht_local_t *heal_local = NULL; + call_frame_t *main_frame = NULL; + call_frame_t *heal_frame = NULL; + int op_errno = 0; + int ret = -1; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + uint32_t vol_commit_hash = 0; + xlator_t *source = NULL; + int heal_path = 0; + int error_while_marking_mds = 0; + int i = 0; + loc_t loc = {0}; + int8_t is_read_only = 0, layout_anomalies = 0; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + + local = discover_frame->local; + layout = local->layout; + conf = this->private; + gf_uuid_unparse(local->gfid, gfid_local); + + LOCK(&discover_frame->lock); + { + main_frame = local->main_frame; + local->main_frame = NULL; + } + UNLOCK(&discover_frame->lock); + + if (!main_frame) + return 0; + + /* Code to update all extended attributed from + subvol to local->xattr on that internal xattr has found + */ + if (conf->subvolume_cnt == 1) + local->need_xattr_heal = 0; + if (local->need_xattr_heal && (local->mds_xattr)) { + dht_dir_set_heal_xattr(this, local, local->xattr, local->mds_xattr, + NULL, 
NULL); + dict_unref(local->mds_xattr); + local->mds_xattr = NULL; + } + + ret = dict_get_int8(local->xattr_req, QUOTA_READ_ONLY_KEY, &is_read_only); + if (ret < 0) + gf_msg_debug(this->name, 0, "key = %s not present in dict", + QUOTA_READ_ONLY_KEY); + + if (local->file_count && local->dir_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_FILE_TYPE_MISMATCH, + "path %s exists as a file on one subvolume " + "and directory on another. " + "Please fix it manually", + local->loc.path); + op_errno = EIO; + goto out; + } + + if (local->cached_subvol) { + ret = dht_layout_preset(this, local->cached_subvol, local->inode); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SET_FAILED, + "failed to set layout for subvolume %s", + local->cached_subvol ? local->cached_subvol->name : "<nil>"); + op_errno = EINVAL; + goto out; + } + } else { + ret = dht_layout_normalize(this, &local->loc, layout); + if ((ret < 0) || ((ret > 0) && (local->op_ret != 0))) { + /* either the layout is incorrect or the directory is + * not found even in one subvolume. + */ + gf_msg_debug(this->name, 0, + "normalizing failed on %s " + "(overlaps/holes present: %s, " + "ENOENT errors: %d)", + local->loc.path, (ret < 0) ? "yes" : "no", + (ret > 0) ? 
ret : 0); + layout_anomalies = 1; + } else if (local->inode) { + dht_layout_set(this, local->inode, layout); + } + } + + if (!conf->vch_forced) { + ret = dict_get_uint32(local->xattr, conf->commithash_xattr_name, + &vol_commit_hash); + if (ret == 0) { + conf->vol_commit_hash = vol_commit_hash; } + } - WIPE (&local->postparent); + if (IA_ISDIR(local->stbuf.ia_type) && !is_read_only) { + for (i = 0; i < layout->cnt; i++) { + if (!source && !layout->list[i].err) + source = layout->list[i].xlator; + if (layout->list[i].err == ENOENT || + layout->list[i].err == ESTALE) { + heal_path = 1; + } + + if (source && heal_path) + break; + } + } + + if (IA_ISDIR(local->stbuf.ia_type)) { + /* Call function to save hashed subvol on inode ctx if + internal mds xattr is not present and all subvols are up + */ + if (!local->op_ret && !__is_root_gfid(local->stbuf.ia_gfid)) + (void)dht_common_mark_mdsxattr(discover_frame, + &error_while_marking_mds, 1); + + if (local->need_xattr_heal && !heal_path) { + local->need_xattr_heal = 0; + ret = dht_dir_xattr_heal(this, local, &op_errno); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_DIR_XATTR_HEAL_FAILED, + "xattr heal failed for " + "directory gfid is %s ", + gfid_local); + } + } + } + + if (source && (heal_path || layout_anomalies || error_while_marking_mds)) { + gf_uuid_copy(loc.gfid, local->gfid); + if (gf_uuid_is_null(loc.gfid)) { + goto done; + } + + if (local->inode) + loc.inode = inode_ref(local->inode); + else + goto done; + + heal_frame = create_frame(this, this->ctx->pool); + if (heal_frame) { + heal_local = dht_local_init(heal_frame, &loc, NULL, 0); + if (!heal_local) + goto cleanup; + + gf_uuid_copy(heal_local->gfid, local->gfid); + heal_frame->cookie = source; + heal_local->xattr = dict_ref(local->xattr); + heal_local->stbuf = local->stbuf; + heal_local->postparent = local->postparent; + heal_local->inode = inode_ref(loc.inode); + heal_local->main_frame = main_frame; + FRAME_SU_DO(heal_frame, dht_local_t); 
+ ret = synctask_new(this->ctx->env, dht_heal_full_path, + dht_heal_full_path_done, heal_frame, heal_frame); + if (!ret) { + loc_wipe(&loc); + return 0; + } + /* + * Failed to spawn the synctask. Returning + * with out doing heal. + */ + cleanup: + loc_wipe(&loc); + DHT_STACK_DESTROY(heal_frame); + } + } +done: + dht_set_fixed_dir_stat(&local->postparent); + /* Delete mds xattr at the time of STACK UNWIND */ + if (local->xattr) + GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr); - DHT_STACK_UNWIND (lookup, frame, ret, local->op_errno, local->inode, - &local->stbuf, local->xattr, &local->postparent); + DHT_STACK_UNWIND(lookup, main_frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + return 0; out: - return ret; + DHT_STACK_UNWIND(lookup, main_frame, -1, op_errno, NULL, NULL, NULL, NULL); + + return ret; } +static int +dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = cookie; + int ret = -1; + dht_conf_t *conf = 0; + dht_layout_t *layout = NULL; + int32_t mds_heal_fresh_lookup = 0; + + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + + local = frame->local; + conf = this->private; + layout = local->selfheal.layout; + mds_heal_fresh_lookup = local->mds_heal_fresh_lookup; + + if (op_ret) { + gf_msg_debug(this->name, op_ret, + "Failed to set %s on the MDS %s for path %s. 
", + conf->mds_xattr_key, prev->name, local->loc.path); + } else { + /* Save mds subvol on inode ctx */ + ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "Failed to set mds subvol on inode ctx" + " %s for %s ", + prev->name, local->loc.path); + } + } + if (!local->mds_heal_fresh_lookup && layout) { + dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf, 0xffffffff, + layout); + } +out: + if (mds_heal_fresh_lookup) + DHT_STACK_DESTROY(frame); + return 0; +} -int -dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) -{ - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - int ret = -1; - int is_dir = 0; - - GF_VALIDATE_OR_GOTO ("dht", frame, out); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", this->private, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); - - conf = this->private; - local = frame->local; - prev = cookie; +static xlator_t * +dht_inode_get_hashed_subvol(inode_t *inode, xlator_t *this, loc_t *loc) +{ + char *path = NULL; + loc_t populate_loc = { + 0, + }; + char *name = NULL; + xlator_t *hash_subvol = NULL; + + if (!inode) + return hash_subvol; + + if (loc && loc->parent && loc->path) { + if (!loc->name) { + name = strrchr(loc->path, '/'); + if (name) { + loc->name = name + 1; + } else { + goto out; + } + } + hash_subvol = dht_subvol_get_hashed(this, loc); + goto out; + } - layout = local->layout; + if (!gf_uuid_is_null(inode->gfid)) { + populate_loc.inode = inode_ref(inode); + populate_loc.parent = inode_parent(populate_loc.inode, NULL, NULL); + inode_path(populate_loc.inode, NULL, &path); - if (!op_ret && uuid_is_null (local->gfid)) - memcpy (local->gfid, 
stbuf->ia_gfid, 16); + if (!path) + goto out; - LOCK (&frame->lock); - { - /* TODO: assert equal mode on stbuf->st_mode and - local->stbuf->st_mode + populate_loc.path = path; + if (!populate_loc.name && populate_loc.path) { + name = strrchr(populate_loc.path, '/'); + if (name) { + populate_loc.name = name + 1; - else mkdir/chmod/chown and fix - */ - ret = dht_layout_merge (this, layout, prev->this, - op_ret, op_errno, xattr); - - if (op_ret == -1) { - local->op_errno = ENOENT; - gf_log (this->name, GF_LOG_DEBUG, - "lookup of %s on %s returned error (%s)", - local->loc.path, prev->this->name, - strerror (op_errno)); + } else { + goto out; + } + } + hash_subvol = dht_subvol_get_hashed(this, &populate_loc); + } +out: + if (populate_loc.inode) + loc_wipe(&populate_loc); + return hash_subvol; +} - goto unlock; +/* Common function call by revalidate/selfheal code path to populate + internal xattr if it is not present, mark_during_fresh_lookup value + determines either function is call by revalidate_cbk(discover_complete) + or call by selfheal code path while fresh lookup. + Here we do wind a call serially in case of fresh lookup and + for other lookup code path we do wind a call parallel.The reason + to wind a call serially is at the time of fresh lookup directory is not + discovered and at the time of revalidate_lookup directory is + already discovered. So, revalidate codepath can race with setxattr + codepath and can get into spurious heals because of an ongoing setxattr. + This can slow down revalidates, if healing happens in foreground. + However, if healing happens in background, there is no direct performance + penalty. 
+*/ +int +dht_common_mark_mdsxattr(call_frame_t *frame, int *errst, + int mark_during_fresh_lookup) +{ + dht_local_t *local = NULL; + xlator_t *this = NULL; + xlator_t *hashed_subvol = NULL; + int ret = 0; + int i = 0; + dict_t *xattrs = NULL; + char gfid_local[GF_UUID_BUF_SIZE] = { + 0, + }; + int32_t zero[1] = {0}; + dht_conf_t *conf = 0; + dht_layout_t *layout = NULL; + dht_local_t *copy_local = NULL; + call_frame_t *xattr_frame = NULL; + gf_boolean_t vol_down = _gf_false; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + this = frame->this; + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + local = frame->local; + conf = this->private; + layout = local->selfheal.layout; + local->mds_heal_fresh_lookup = mark_during_fresh_lookup; + + gf_uuid_unparse(local->gfid, gfid_local); + + /* Code to update hashed subvol consider as a mds subvol + and wind a setxattr call on hashed subvol to update + internal xattr + */ + if (!local->xattr || !dict_get(local->xattr, conf->mds_xattr_key)) { + /* It means no internal MDS xattr has been set yet + */ + /* Check the status of all subvol are up while call + this function call by lookup code path + */ + if (mark_during_fresh_lookup) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->subvolume_status[i]) { + vol_down = _gf_true; + break; } + } + if (vol_down) { + gf_msg_debug(this->name, 0, + "subvol %s is down. 
Unable to " + " save mds subvol on inode for " + " path %s gfid is %s ", + conf->subvolumes[i]->name, local->loc.path, + gfid_local); + goto out; + } + } - is_dir = check_is_dir (inode, stbuf, xattr); - if (!is_dir) { - gf_log (this->name, GF_LOG_DEBUG, - "lookup of %s on %s returned non dir 0%o", - local->loc.path, prev->this->name, - stbuf->ia_type); - local->need_selfheal = 1; - goto unlock; - } + /* Calculate hashed subvol based on inode and parent node + */ + hashed_subvol = dht_inode_get_hashed_subvol(local->inode, this, + &local->loc); + if (!hashed_subvol) { + gf_msg(this->name, GF_LOG_DEBUG, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Failed to get hashed subvol for path %s" + "gfid is %s ", + local->loc.path, gfid_local); + if (errst) + (*errst) = 1; + ret = -1; + goto out; + } + xattrs = dict_new(); + if (!xattrs) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "dict_new failed"); + ret = -1; + goto out; + } + /* Add internal MDS xattr on disk for hashed subvol + */ + ret = dht_dict_set_array(xattrs, conf->mds_xattr_key, zero, 1); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary" + " value:key = %s for " + "path %s", + conf->mds_xattr_key, local->loc.path); + ret = -1; + goto out; + } + /* Create a new frame to wind a call only while + this function call by revalidate_cbk code path + To wind a call parallel need to create a new frame + */ + if (mark_during_fresh_lookup) { + xattr_frame = create_frame(this, this->ctx->pool); + if (!xattr_frame) { + ret = -1; + goto out; + } + copy_local = dht_local_init(xattr_frame, &(local->loc), NULL, 0); + if (!copy_local) { + ret = -1; + DHT_STACK_DESTROY(xattr_frame); + goto out; + } + copy_local->stbuf = local->stbuf; + copy_local->mds_heal_fresh_lookup = mark_during_fresh_lookup; + if (!copy_local->inode) + copy_local->inode = inode_ref(local->inode); + gf_uuid_copy(copy_local->loc.gfid, local->gfid); + FRAME_SU_DO(xattr_frame, 
dht_local_t); + STACK_WIND_COOKIE(xattr_frame, dht_common_mark_mdsxattr_cbk, + hashed_subvol, hashed_subvol, + hashed_subvol->fops->setxattr, &local->loc, + xattrs, 0, NULL); + } else { + STACK_WIND_COOKIE(frame, dht_common_mark_mdsxattr_cbk, + (void *)hashed_subvol, hashed_subvol, + hashed_subvol->fops->setxattr, &local->loc, + xattrs, 0, NULL); + } + } else { + gf_msg_debug(this->name, 0, + "internal xattr %s is present on subvol" + "on path %s gfid is %s ", + conf->mds_xattr_key, local->loc.path, gfid_local); + if (!mark_during_fresh_lookup) + dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf, + 0xffffffff, layout); + } - local->op_ret = 0; - if (local->xattr == NULL) { - local->xattr = dict_ref (xattr); - } else { - dht_aggregate_xattr (local->xattr, xattr); - } +out: + if (xattrs) + dict_unref(xattrs); + return ret; +} - if (local->inode == NULL) - local->inode = inode_ref (inode); +/* Get the value of key from dict in the bytewise and save in array after + convert from network byte order to host byte order +*/ +static int32_t +dht_dict_get_array(dict_t *dict, char *key, int32_t value[], int32_t size, + int *errst) +{ + void *ptr = NULL; + int32_t len = -1; + int32_t vindex = -1; + int32_t err = -1; + int ret = 0; + + if (dict == NULL) { + (*errst) = -1; + return -EINVAL; + } + err = dict_get_ptr_and_len(dict, key, &ptr, &len); + if (err != 0) { + (*errst) = -1; + return err; + } + + if (len != (size * sizeof(int32_t))) { + (*errst) = -1; + return -EINVAL; + } + + for (vindex = 0; vindex < size; vindex++) { + value[vindex] = ntoh32(*((int32_t *)ptr + vindex)); + if (value[vindex] < 0) + ret = -1; + } + + return ret; +} - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->postparent, postparent, - prev->this); +static int +dht_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent) +{ + dht_local_t 
*local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + int is_dir = 0; + int32_t check_mds = 0; + int is_linkfile = 0; + int attempt_unwind = 0; + dht_conf_t *conf = 0; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + char gfid_node[GF_UUID_BUF_SIZE] = {0}; + int32_t mds_xattr_val[1] = {0}; + int errst = 0; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", this->private, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + + local = frame->local; + prev = cookie; + conf = this->private; + + layout = local->layout; + + /* Check if the gfid is different for file from other node */ + if (!op_ret && gf_uuid_compare(local->gfid, stbuf->ia_gfid)) { + gf_uuid_unparse(stbuf->ia_gfid, gfid_node); + gf_uuid_unparse(local->gfid, gfid_local); + + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH, + "%s: gfid different on %s, gfid local = %s" + "gfid other = %s", + local->loc.path, prev->name, gfid_local, gfid_node); + } + + LOCK(&frame->lock); + { + /* TODO: assert equal mode on stbuf->st_mode and + local->stbuf->st_mode + + else mkdir/chmod/chown and fix + */ + + ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED, + "%s: failed to merge layouts for subvol %s", local->loc.path, + prev->name); - if (prev->this == dht_first_up_subvol (this)) { - local->ia_ino = local->stbuf.ia_ino; - } + if (op_ret == -1) { + local->op_errno = op_errno; + gf_msg_debug(this->name, op_errno, + "lookup of %s on %s returned error", local->loc.path, + prev->name); + goto unlock; } -unlock: - UNLOCK (&frame->lock); + is_linkfile = check_is_linkfile(inode, stbuf, xattr, + conf->link_xattr_name); + is_dir = check_is_dir(inode, stbuf, xattr); - this_call_cnt = dht_frame_return (frame); + if (is_dir) { + local->dir_count++; + } else { + 
local->file_count++; + + if (!is_linkfile && !local->cached_subvol) { + /* real file */ + /* Ok, we somehow managed to find a file on + * more than one subvol. ignore this or we + * will end up overwriting information while a + * a thread is potentially unwinding from + * dht_discover_complete + */ + local->cached_subvol = prev; + attempt_unwind = 1; + } else { + goto unlock; + } + } - if (is_last_call (this_call_cnt)) { - if (local->need_selfheal) { - local->need_selfheal = 0; - dht_lookup_everywhere (frame, this, &local->loc); - return 0; - } + local->op_ret = 0; - if (local->op_ret == 0) { - ret = dht_layout_normalize (this, &local->loc, layout); + if (local->xattr == NULL) { + local->xattr = dict_ref(xattr); + } else { + /* Don't aggregate for files. See BZ#1484709 */ + if (is_dir) + dht_aggregate_xattr(local->xattr, xattr); + } - if (ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "fixing assignment on %s", - local->loc.path); - goto selfheal; - } + if (local->inode == NULL) + local->inode = inode_ref(inode); - dht_layout_set (this, local->inode, layout); + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->postparent, postparent); - if (local->ia_ino) { - local->stbuf.ia_ino = local->ia_ino; - } else { - gf_log (this->name, GF_LOG_DEBUG, - "could not find hashed subvol for %s", - local->loc.path); - } + if (!dict_get(xattr, conf->mds_xattr_key)) { + goto unlock; + } else { + gf_msg_debug(this->name, 0, + "internal xattr %s is present on subvol" + "on path %s gfid is %s ", + conf->mds_xattr_key, local->loc.path, gfid_local); + } + check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key, + mds_xattr_val, 1, &errst); + /* save mds subvol on inode ctx */ + ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "Failed to set hashed subvol for %s vol is %s", + local->loc.path, prev->name); + } + + if ((check_mds < 0) && !errst) { + local->mds_xattr = 
dict_ref(xattr); + gf_msg_debug(this->name, 0, + "Value of %s is not zero on mds subvol" + "so xattr needs to be healed on non mds" + " path is %s and vol name is %s " + " gfid is %s", + conf->mds_xattr_key, local->loc.path, prev->name, + gfid_local); + local->need_xattr_heal = 1; + local->mds_subvol = prev; + } + } +unlock: + UNLOCK(&frame->lock); +out: + /* Make sure, the thread executing dht_discover_complete is the one + * which calls STACK_DESTROY (frame). In the case of "attempt_unwind", + * this makes sure that the thread don't call dht_frame_return, till + * call to dht_discover_complete is done. + */ + if (attempt_unwind) { + dht_discover_complete(this, frame); + } - if (local->loc.parent) - local->postparent.ia_ino = - local->loc.parent->ino; - } + this_call_cnt = dht_frame_return(frame); - DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, local->xattr, - &local->postparent); - } + if (is_last_call(this_call_cnt) && !attempt_unwind) { + dht_discover_complete(this, frame); + } - return 0; + if (is_last_call(this_call_cnt)) + DHT_STACK_DESTROY(frame); -selfheal: - FRAME_SU_DO (frame, dht_local_t); - ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk, - &local->loc, layout); -out: - return ret; + return 0; } -int -dht_lookup_root_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) -{ - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - int ret = -1; - int is_dir = 0; - - GF_VALIDATE_OR_GOTO ("dht", frame, out); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", this->private, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); - - conf = this->private; - local = frame->local; - prev = cookie; - - layout = local->layout; +static int 
+dht_set_file_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + int ret = -EINVAL; + dht_conf_t *conf = NULL; + + conf = this->private; + if (!conf) { + goto err; + } + + if (!xattr_req) { + goto err; + } + + /* Used to check whether this is a linkto file. + */ + ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value:key = %s for " + "path %s", + conf->link_xattr_name, loc->path); + goto err; + } + + /* This is used to make sure we don't unlink linkto files + * which are the target of an ongoing file migration. + */ + ret = dict_set_uint32(xattr_req, GLUSTERFS_OPEN_FD_COUNT, 4); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value:key = %s for " + "path %s", + GLUSTERFS_OPEN_FD_COUNT, loc->path); + goto err; + } + + ret = 0; +err: + return ret; +} - LOCK (&frame->lock); - { - ret = dht_layout_merge (this, layout, prev->this, - op_ret, op_errno, xattr); - - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_ERROR, - "lookup of %s on %s returned error (%s)", - local->loc.path, prev->this->name, - strerror (op_errno)); - goto unlock; - } +/* This is a gfid based nameless lookup. Without a name, the hashed subvol + * cannot be calculated so a lookup is sent to all subvols. 
+ */ +static int +dht_do_discover(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + int ret; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int call_cnt = 0; + int op_errno = EINVAL; + int i = 0; + call_frame_t *discover_frame = NULL; + + conf = this->private; + local = frame->local; + + /* As we do not know if this is a file or directory, request + * both file and directory xattrs + */ + ret = dht_set_file_xattr_req(this, loc, local->xattr_req); + if (ret) { + goto err; + } + + ret = dht_set_dir_xattr_req(this, loc, local->xattr_req); + if (ret) { + goto err; + } + + if (loc_is_root(loc)) { + /* Request the DHT commit hash xattr (trusted.glusterfs.dht.commithash) + * set on the brick root. + */ + ret = dict_set_uint32(local->xattr_req, conf->commithash_xattr_name, + sizeof(uint32_t)); + } - is_dir = check_is_dir (inode, stbuf, xattr); - if (!is_dir) { - gf_log (this->name, GF_LOG_CRITICAL, - "lookup of %s on %s returned non dir 0%o", - local->loc.path, prev->this->name, - stbuf->ia_type); - goto unlock; - } + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; - local->op_ret = 0; - if (local->xattr == NULL) - local->xattr = dict_ref (xattr); - if (local->inode == NULL) - local->inode = inode_ref (inode); + local->layout = dht_layout_new(this, conf->subvolume_cnt); - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + if (!local->layout) { + op_errno = ENOMEM; + goto err; + } - if (prev->this == dht_first_up_subvol (this)) { - local->ia_ino = local->stbuf.ia_ino; - } + gf_uuid_copy(local->gfid, loc->gfid); - } -unlock: - UNLOCK (&frame->lock); + discover_frame = copy_frame(frame); + if (!discover_frame) { + op_errno = ENOMEM; + goto err; + } + discover_frame->local = local; + frame->local = NULL; + local->main_frame = frame; - this_call_cnt = dht_frame_return (frame); + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(discover_frame, dht_discover_cbk, conf->subvolumes[i], + conf->subvolumes[i], + 
conf->subvolumes[i]->fops->lookup, &local->loc, + local->xattr_req); + } - if (is_last_call (this_call_cnt)) { - if (local->op_ret == 0) { - ret = dht_layout_normalize (this, &local->loc, layout); - if (ret != 0) { - gf_log (this->name, GF_LOG_INFO, - "fixing assignment on %s", - local->loc.path); - } + return 0; - ret = dht_layout_set (this, local->inode, layout); - } +err: + DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); - DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, local->xattr, - &local->postparent); - } + return 0; +} +/* Code to call syntask to heal custom xattr from hashed subvol + to non hashed subvol +*/ +int +dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno) +{ + dht_local_t *copy_local = NULL; + call_frame_t *copy = NULL; + int ret = -1; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + + if (gf_uuid_is_null(local->gfid)) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DIR_XATTR_HEAL_FAILED, + "No gfid exists for path %s " + "so healing xattr is not possible", + local->loc.path); + *op_errno = EIO; + goto out; + } + + gf_uuid_unparse(local->gfid, gfid_local); + copy = create_frame(this, this->ctx->pool); + if (copy) { + copy_local = dht_local_init(copy, &(local->loc), NULL, 0); + if (!copy_local) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, + DHT_MSG_DIR_XATTR_HEAL_FAILED, + "Memory allocation failed " + "for path %s gfid %s ", + local->loc.path, gfid_local); + *op_errno = ENOMEM; + DHT_STACK_DESTROY(copy); + } else { + copy_local->stbuf = local->stbuf; + gf_uuid_copy(copy_local->loc.gfid, local->gfid); + copy_local->mds_subvol = local->mds_subvol; + FRAME_SU_DO(copy, dht_local_t); + ret = synctask_new(this->ctx->env, dht_dir_heal_xattrs, + dht_dir_heal_xattrs_done, copy, copy); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, + DHT_MSG_DIR_XATTR_HEAL_FAILED, + "Synctask creation failed to heal xattr " + "for path %s gfid %s ", + local->loc.path, 
gfid_local); + *op_errno = ENOMEM; + DHT_STACK_DESTROY(copy); + } + } + } out: - return ret; + return ret; } static int -dht_do_fresh_lookup_on_root (xlator_t *this, call_frame_t *frame) +dht_needs_selfheal(call_frame_t *frame, xlator_t *this) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - int call_cnt = 0; - int i = 0; - int op_errno = EINVAL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int needs_selfheal = 0; + int ret = 0; + + local = frame->local; + layout = local->layout; + + if (local->need_attrheal || local->need_xattr_heal || + local->need_selfheal) { + needs_selfheal = 1; + } + + ret = dht_layout_normalize(this, &local->loc, layout); + + if (ret != 0) { + gf_msg_debug(this->name, 0, "fixing assignment on %s", local->loc.path); + needs_selfheal = 1; + } + return needs_selfheal; +} - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame, unwind); - GF_VALIDATE_OR_GOTO ("dht", frame->local, unwind); +static int +is_permission_different(ia_prot_t *prot1, ia_prot_t *prot2) +{ + if ((prot1->owner.read != prot2->owner.read) || + (prot1->owner.write != prot2->owner.write) || + (prot1->owner.exec != prot2->owner.exec) || + (prot1->group.read != prot2->group.read) || + (prot1->group.write != prot2->group.write) || + (prot1->group.exec != prot2->group.exec) || + (prot1->other.read != prot2->other.read) || + (prot1->other.write != prot2->other.write) || + (prot1->other.exec != prot2->other.exec) || + (prot1->suid != prot2->suid) || (prot1->sgid != prot2->sgid) || + (prot1->sticky != prot2->sticky)) { + return 1; + } else { + return 0; + } +} - local = frame->local; - conf = this->private; - if (!conf) - goto err; +int +dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + 
dht_layout_t *layout = NULL; + int ret = -1; + int is_dir = 0; + int32_t check_mds = 0; + int errst = 0; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + char gfid_node[GF_UUID_BUF_SIZE] = {0}; + int32_t mds_xattr_val[1] = {0}; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", this->private, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + + local = frame->local; + prev = cookie; + conf = this->private; + + layout = local->layout; + gf_msg_debug(this->name, op_errno, + "%s: lookup on %s returned with op_ret = %d, op_errno = %d", + local->loc.path, prev->name, op_ret, op_errno); + + /* The first successful lookup*/ + if (!op_ret && gf_uuid_is_null(local->gfid)) { + memcpy(local->gfid, stbuf->ia_gfid, 16); + } + if (!gf_uuid_is_null(local->gfid)) { + gf_uuid_unparse(local->gfid, gfid_local); + } + + /* Check if the gfid is different for file from other node */ + if (!op_ret && gf_uuid_compare(local->gfid, stbuf->ia_gfid)) { + gf_uuid_unparse(stbuf->ia_gfid, gfid_node); + + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH, + "%s: gfid different on %s." + " gfid local = %s, gfid subvol = %s", + local->loc.path, prev->name, gfid_local, gfid_node); + } + + LOCK(&frame->lock); + { + /* TODO: assert equal mode on stbuf->st_mode and + local->stbuf->st_mode + else mkdir/chmod/chown and fix + */ + ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr); + + if (op_ret == -1) { + local->op_errno = op_errno; - if (local->layout) { - dht_layout_unref (this, local->layout); - local->layout = NULL; + /* The GFID is missing on this subvol. Force a heal. 
*/ + if (op_errno == ENODATA) { + local->need_lookup_everywhere = 1; + } + goto unlock; } - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set the dict entry for dht"); + is_dir = check_is_dir(inode, stbuf, xattr); + if (!is_dir) { + gf_msg_debug(this->name, 0, + "%s: lookup on %s returned non dir 0%o" + "calling lookup_everywhere", + local->loc.path, prev->name, stbuf->ia_type); - call_cnt = local->call_cnt = conf->subvolume_cnt; + local->need_lookup_everywhere = 1; + goto unlock; + } - local->layout = dht_layout_new (this, - conf->subvolume_cnt); - if (!local->layout) { - local->op_errno = ENOMEM; - goto err; + local->op_ret = 0; + if (local->xattr == NULL) { + local->xattr = dict_ref(xattr); + } else { + dht_aggregate_xattr(local->xattr, xattr); + } + + if (__is_root_gfid(stbuf->ia_gfid)) { + ret = dht_dir_has_layout(xattr, conf->xattr_name); + if (ret >= 0) { + if (is_greater_time(local->prebuf.ia_ctime, + local->prebuf.ia_ctime_nsec, + stbuf->ia_ctime, stbuf->ia_ctime_nsec)) { + /* Choose source */ + local->prebuf.ia_gid = stbuf->ia_gid; + local->prebuf.ia_uid = stbuf->ia_uid; + + local->prebuf.ia_ctime = stbuf->ia_ctime; + local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec; + local->prebuf.ia_prot = stbuf->ia_prot; + } + } } - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_root_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - &local->loc, local->xattr_req); + if (local->stbuf.ia_type != IA_INVAL) { + /* This is not the first subvol to respond + * Compare values to see if attrs need to be healed + */ + if ((local->stbuf.ia_gid != stbuf->ia_gid) || + (local->stbuf.ia_uid != stbuf->ia_uid) || + (is_permission_different(&local->stbuf.ia_prot, + &stbuf->ia_prot))) { + local->need_attrheal = 1; + } } - return 0; -err: - DHT_STACK_UNWIND (lookup, frame, -1, local->op_errno, - local->inode, &local->stbuf, local->xattr, - 
&local->postparent); + if (local->inode == NULL) + local->inode = inode_ref(inode); - return 0; + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->postparent, postparent); + + if (!dict_get(xattr, conf->mds_xattr_key)) { + gf_msg_debug(this->name, 0, + "%s: mds xattr %s is not present " + "on %s(gfid = %s)", + local->loc.path, conf->mds_xattr_key, prev->name, + gfid_local); + goto unlock; + } + + /* Save the mds subvol info and stbuf. This is the value that will + * be used for healing + */ + local->mds_subvol = prev; + local->mds_stbuf = *stbuf; + + /* Save mds subvol on inode ctx */ + + ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "%s: Failed to set mds (%s)", local->loc.path, prev->name); + } + check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key, + mds_xattr_val, 1, &errst); + if ((check_mds < 0) && !errst) { + /* Check if xattrs need to be healed on the directories */ + local->mds_xattr = dict_ref(xattr); + gf_msg_debug(this->name, 0, + "%s: %s is not zero on %s. Xattrs need to be healed." 
+ "(gfid = %s)", + local->loc.path, conf->mds_xattr_key, prev->name, + gfid_local); + local->need_xattr_heal = 1; + } + } + +unlock: + UNLOCK(&frame->lock); + + this_call_cnt = dht_frame_return(frame); + + if (is_last_call(this_call_cnt)) { + /* If the mds subvol is not set correctly*/ + if (!__is_root_gfid(local->gfid) && + (!dict_get(local->xattr, conf->mds_xattr_key))) { + local->need_selfheal = 1; + } + + /* No need to call xattr heal code if volume count is 1 + */ + if (conf->subvolume_cnt == 1) { + local->need_xattr_heal = 0; + } + + if (local->need_selfheal || local->need_lookup_everywhere) { + /* Set the gfid-req so posix will set the GFID*/ + if (!gf_uuid_is_null(local->gfid)) { + /* Ok, this should _never_ happen */ + ret = dict_set_static_bin(local->xattr_req, "gfid-req", + local->gfid, 16); + } else { + if (!gf_uuid_is_null(local->gfid_req)) + ret = dict_set_static_bin(local->xattr_req, "gfid-req", + local->gfid_req, 16); + } + } + + if (local->need_lookup_everywhere) { + local->need_lookup_everywhere = 0; + dht_lookup_everywhere(frame, this, &local->loc); + return 0; + } + + if (local->op_ret == 0) { + if (dht_needs_selfheal(frame, this)) { + goto selfheal; + } + + dht_layout_set(this, local->inode, layout); + if (local->inode) { + dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1); + } + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); + } + } + + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + /* Delete mds xattr at the time of STACK UNWIND */ + if (local->xattr) + GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr); + + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + } + + return 0; + +selfheal: + FRAME_SU_DO(frame, dht_local_t); + ret = dht_selfheal_directory(frame, dht_lookup_selfheal_cbk, &local->loc, + layout); +out: + return ret; +} + +static 
int +dht_lookup_directory(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + int call_cnt = 0; + int i = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int ret = 0; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, unwind); + GF_VALIDATE_OR_GOTO("dht", frame->local, unwind); + GF_VALIDATE_OR_GOTO("dht", this->private, unwind); + GF_VALIDATE_OR_GOTO("dht", loc, unwind); + + conf = this->private; + local = frame->local; + + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new(this, conf->subvolume_cnt); + if (!local->layout) { + goto unwind; + } + + if (local->xattr != NULL) { + dict_unref(local->xattr); + local->xattr = NULL; + } + + if (!gf_uuid_is_null(local->gfid)) { + /* use this gfid in order to heal any missing ones */ + ret = dict_set_gfuuid(local->xattr_req, "gfid-req", local->gfid, true); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "%s: Failed to set dictionary value:" + " key = gfid-req", + local->loc.path); + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE( + frame, dht_lookup_dir_cbk, conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, local->xattr_req); + } + return 0; unwind: - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL); out: - return -1; + return 0; } int -dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - int is_dir = 0; - int is_linkfile = 0; - unsigned char root_gfid[16] = {0,}; - - GF_VALIDATE_OR_GOTO ("dht", frame, err); - GF_VALIDATE_OR_GOTO ("dht", this, err); - GF_VALIDATE_OR_GOTO ("dht", 
frame->local, err); - GF_VALIDATE_OR_GOTO ("dht", cookie, err); +dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int is_dir = 0; + int is_linkfile = 0; + int follow_link = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + uint32_t vol_commit_hash = 0; + xlator_t *subvol = NULL; + int32_t check_mds = 0; + int errst = 0, i = 0; + int32_t mds_xattr_val[1] = {0}; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, err); + GF_VALIDATE_OR_GOTO("dht", frame->local, err); + GF_VALIDATE_OR_GOTO("dht", cookie, err); + GF_VALIDATE_OR_GOTO("dht", this->private, err); + + local = frame->local; + prev = cookie; + conf = this->private; + + if (!conf->vch_forced) { + /* Update the commithash value if available + */ + ret = dict_get_uint32(xattr, conf->commithash_xattr_name, + &vol_commit_hash); + if (ret == 0) { + conf->vol_commit_hash = vol_commit_hash; + } + } - local = frame->local; - prev = cookie; - conf = this->private; - if (!conf) - goto out; + gf_uuid_unparse(local->loc.gfid, gfid); + + gf_msg_debug(this->name, op_errno, + "%s: revalidate lookup on %s returned op_ret %d", + local->loc.path, prev->name, op_ret); - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - - if ((op_errno != ENOTCONN) - && (op_errno != ENOENT) - && (op_errno != ESTALE)) { - gf_log (this->name, GF_LOG_INFO, - "subvolume %s for %s returned -1 (%s)", - prev->this->name, local->loc.path, - strerror (op_errno)); - } - if (op_errno == ESTALE) { - /* propogate the ESTALE to parent. - * setting local->layout_mismatch would send - * ESTALE to parent. 
*/ - local->layout_mismatch = 1; + LOCK(&frame->lock); + { + if (gf_uuid_is_null(local->gfid)) { + memcpy(local->gfid, local->loc.gfid, 16); + } + + if (op_ret == -1) { + local->op_errno = op_errno; + + if ((op_errno != ENOTCONN) && (op_errno != ENOENT) && + (op_errno != ESTALE)) { + gf_msg(this->name, GF_LOG_INFO, op_errno, + DHT_MSG_REVALIDATE_CBK_INFO, + "Revalidate: subvolume %s for %s " + "(gfid = %s) returned -1", + prev->name, local->loc.path, gfid); + } + if (op_errno == ESTALE) { + /* propagate the ESTALE to parent. + * setting local->return_estale would send + * ESTALE to parent. */ + local->return_estale = 1; + } + + /* if it is ENOENT, we may have to do a + * 'lookup_everywhere()' to make sure + * the file is not migrated */ + if (op_errno == ENOENT) { + if (IA_ISREG(local->loc.inode->ia_type)) { + gf_msg_debug(this->name, 0, + "found ENOENT for %s. " + "Setting " + "need_lookup_everywhere" + " flag to 1", + local->loc.path); + + local->need_lookup_everywhere = 1; + } else if (IA_ISDIR(local->loc.inode->ia_type)) { + layout = local->layout; + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == prev) { + layout->list[i].err = op_errno; + break; } + } - goto unlock; + local->need_selfheal = 1; } + } - if (stbuf->ia_type != local->inode->ia_type) { - gf_log (this->name, GF_LOG_INFO, - "mismatching filetypes 0%o v/s 0%o for %s", - (stbuf->ia_type), (local->inode->ia_type), - local->loc.path); + /* The GFID is missing on this subvol. 
Lookup everywhere to force a + * gfid heal + */ + if ((op_errno == ENODATA) && + (IA_ISDIR(local->loc.inode->ia_type))) { + local->need_lookup_everywhere = 1; + } - local->op_ret = -1; - local->op_errno = EINVAL; + goto unlock; + } - goto unlock; - } + if ((!IA_ISINVAL(local->inode->ia_type)) && + stbuf->ia_type != local->inode->ia_type) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FILE_TYPE_MISMATCH, + "mismatching filetypes 0%o v/s 0%o for %s," + " gfid = %s", + (stbuf->ia_type), (local->inode->ia_type), local->loc.path, + gfid); - layout = local->layout; + local->op_ret = -1; + local->op_errno = EINVAL; - is_dir = check_is_dir (inode, stbuf, xattr); - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + goto unlock; + } - if (is_linkfile) { - gf_log (this->name, GF_LOG_INFO, - "linkfile found in revalidate for %s", - local->loc.path); - local->layout_mismatch = 1; + layout = local->layout; - goto unlock; + is_dir = check_is_dir(inode, stbuf, xattr); + is_linkfile = check_is_linkfile(inode, stbuf, xattr, + conf->link_xattr_name); + if (is_linkfile) { + follow_link = 1; + goto unlock; + } + if (is_dir) { + ret = dht_dir_has_layout(xattr, conf->xattr_name); + if (ret >= 0) { + if (is_greater_time(local->prebuf.ia_ctime, + local->prebuf.ia_ctime_nsec, + stbuf->ia_ctime, stbuf->ia_ctime_nsec)) { + /* Choose source */ + local->prebuf.ia_gid = stbuf->ia_gid; + local->prebuf.ia_uid = stbuf->ia_uid; + + local->prebuf.ia_ctime = stbuf->ia_ctime; + local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec; + + if (__is_root_gfid(stbuf->ia_gfid)) + local->prebuf.ia_prot = stbuf->ia_prot; + } + } + + if (local->stbuf.ia_type != IA_INVAL) { + if ((local->stbuf.ia_gid != stbuf->ia_gid) || + (local->stbuf.ia_uid != stbuf->ia_uid) || + is_permission_different(&local->stbuf.ia_prot, + &stbuf->ia_prot)) { + local->need_attrheal = 1; + } + } + + if (!dict_get(xattr, conf->mds_xattr_key)) { + gf_msg_debug(this->name, 0, + "%s: internal xattr %s is not present" + " on subvol 
%s(gfid is %s)", + local->loc.path, conf->mds_xattr_key, prev->name, + gfid); + } else { + check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key, + mds_xattr_val, 1, &errst); + local->mds_subvol = prev; + local->mds_stbuf.ia_gid = stbuf->ia_gid; + local->mds_stbuf.ia_uid = stbuf->ia_uid; + local->mds_stbuf.ia_prot = stbuf->ia_prot; + + /* save mds subvol on inode ctx */ + ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_SET_INODE_CTX_FAILED, + "Failed to set MDS subvol for %s vol is %s", + local->loc.path, prev->name); + } + if ((check_mds < 0) && !errst) { + /* Check if xattrs need to be healed on the directory + */ + local->mds_xattr = dict_ref(xattr); + gf_msg_debug(this->name, 0, + "Value of %s is not zero on " + "hashed subvol so xattr needs to" + " be healed on non hashed" + " path is %s and vol name is %s " + " gfid is %s", + conf->mds_xattr_key, local->loc.path, + prev->name, gfid); + local->need_xattr_heal = 1; } + } + ret = dht_layout_dir_mismatch(this, layout, prev, &local->loc, + xattr); + if (ret != 0) { + /* In memory layout does not match on-disk layout. 
+ */ + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_MISMATCH, + "Mismatching layouts for %s, gfid = %s", local->loc.path, + gfid); - if (is_dir) { - ret = dht_layout_dir_mismatch (this, layout, - prev->this, &local->loc, - xattr); - if (ret != 0) { - gf_log (this->name, GF_LOG_INFO, - "mismatching layouts for %s", - local->loc.path); + local->layout_mismatch = 1; - local->layout_mismatch = 1; + goto unlock; + } + } - goto unlock; - } - } + gf_uuid_copy(local->stbuf.ia_gfid, stbuf->ia_gfid); + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->postparent, postparent); - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->postparent, postparent, - prev->this); + local->op_ret = 0; - local->op_ret = 0; - local->stbuf.ia_ino = local->ia_ino; + if (!local->xattr) { + local->xattr = dict_ref(xattr); + } else if (is_dir) { + dht_aggregate_xattr(local->xattr, xattr); + } + } +unlock: + UNLOCK(&frame->lock); - if (local->loc.parent) - local->postparent.ia_ino = local->loc.parent->ino; + if (follow_link) { + /* Found a linkto file. 
Follow it to see if the target file exists + */ + gf_uuid_copy(local->gfid, stbuf->ia_gfid); - if (!local->xattr) { - local->xattr = dict_ref (xattr); - } else if (is_dir) { - dht_aggregate_xattr (local->xattr, xattr); - } + subvol = dht_linkfile_subvol(this, inode, stbuf, xattr); + if (!subvol) { + op_errno = ESTALE; + local->op_ret = -1; + } else { + STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol, + subvol->fops->lookup, &local->loc, + local->xattr_req); + return 0; } -unlock: - UNLOCK (&frame->lock); -out: - this_call_cnt = dht_frame_return (frame); - - if (is_last_call (this_call_cnt)) { - if (!IA_ISDIR (local->stbuf.ia_type) - && (local->hashed_subvol != local->cached_subvol) - && (local->stbuf.ia_nlink == 1) - && (conf && conf->unhashed_sticky_bit)) { - local->stbuf.ia_prot.sticky = 1; + } + + this_call_cnt = dht_frame_return(frame); + + if (is_last_call(this_call_cnt)) { + if (!IA_ISDIR(local->stbuf.ia_type) && + (local->hashed_subvol != local->cached_subvol) && + (local->stbuf.ia_nlink == 1) && + (conf && conf->unhashed_sticky_bit)) { + local->stbuf.ia_prot.sticky = 1; + } + /* No need to call heal code if volume count is 1 + */ + if (conf->subvolume_cnt == 1) + local->need_xattr_heal = 0; + + if (IA_ISDIR(local->stbuf.ia_type)) { + /* No mds xattr found. 
Trigger a heal to set it */ + if (!__is_root_gfid(local->loc.inode->gfid) && + (!dict_get(local->xattr, conf->mds_xattr_key))) + local->need_selfheal = 1; + + if (dht_needs_selfheal(frame, this)) { + if (!__is_root_gfid(local->loc.inode->gfid)) { + if (local->mds_subvol) { + local->stbuf.ia_gid = local->mds_stbuf.ia_gid; + local->stbuf.ia_uid = local->mds_stbuf.ia_uid; + local->stbuf.ia_prot = local->mds_stbuf.ia_prot; + } + } else { + local->stbuf.ia_gid = local->prebuf.ia_gid; + local->stbuf.ia_uid = local->prebuf.ia_uid; + local->stbuf.ia_prot = local->prebuf.ia_prot; } - if (local->layout_mismatch) { - local->op_ret = -1; - local->op_errno = ESTALE; - - /* Because for 'root' inode, there is no FRESH lookup - * sent from FUSE layer upon ESTALE, we need to handle - * that one case here */ - root_gfid[15] = 1; - if (!local->loc.parent && - !uuid_compare (local->loc.inode->gfid, root_gfid)) { - dht_do_fresh_lookup_on_root (this, frame); - return 0; - } + layout = local->layout; + dht_selfheal_directory(frame, dht_lookup_selfheal_cbk, + &local->loc, layout); + return 0; + } + } + + if (local->layout_mismatch) { + /* Found layout mismatch in the directory, need to + fix this in the inode context */ + dht_layout_unref(this, local->layout); + local->layout = NULL; + dht_lookup_directory(frame, this, &local->loc); + return 0; + } + + if (local->need_lookup_everywhere) { + /* As the current layout gave ENOENT error, we would + need a new layout */ + dht_layout_unref(this, local->layout); + local->layout = NULL; + + /* We know that current cached subvol is no longer + valid, get the new one */ + local->cached_subvol = NULL; + if (local->xattr_req) { + if (!gf_uuid_is_null(local->gfid)) { + ret = dict_set_static_bin(local->xattr_req, "gfid-req", + local->gfid, 16); } + } - WIPE (&local->postparent); + dht_lookup_everywhere(frame, this, &local->loc); + return 0; + } + if (local->return_estale) { + local->op_ret = -1; + local->op_errno = ESTALE; + } - DHT_STACK_UNWIND 
(lookup, frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, local->xattr, - &local->postparent); + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); } -err: - return ret; -} + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + /* local->stbuf is updated only from subvols which have a layout + * The reason is to avoid choosing attr heal source from newly + * added bricks. In case e.g we have only one subvol and for + * some reason layout is not present on it, then local->stbuf + * will be EINVAL. This is an indication that the subvols + * active in the cluster do not have layouts on disk. + * Unwind with ESTALE to trigger a fresh lookup */ + if (is_dir && local->stbuf.ia_type == IA_INVAL) { + local->op_ret = -1; + local->op_errno = ESTALE; + } + /* Delete mds xattr at the time of STACK UNWIND */ + if (local->xattr) + GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr); -int -dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) -{ - dht_local_t *local = NULL; - xlator_t *cached_subvol = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", frame, out); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", this->private, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + } - local = frame->local; - cached_subvol = local->cached_subvol; - conf = this->private; +err: + return ret; +} - ret = dht_layout_preset (this, local->cached_subvol, inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to set layout for subvolume %s", - cached_subvol ? 
cached_subvol->name : "<nil>"); - local->op_ret = -1; - local->op_errno = EINVAL; - goto unwind; - } +static int +dht_lookup_linkfile_create_cbk(call_frame_t *frame, void *cooie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *cached_subvol = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", this->private, out); + + local = frame->local; + cached_subvol = local->cached_subvol; + conf = this->private; + + gf_uuid_unparse(local->loc.gfid, gfid); + + if (local->locked) + dht_unlock_namespace(frame, &local->lock[0]); + + ret = dht_layout_preset(this, local->cached_subvol, local->loc.inode); + if (ret < 0) { + gf_msg_debug(this->name, EINVAL, + "Failed to set layout for subvolume %s, " + "(gfid = %s)", + cached_subvol ? 
cached_subvol->name : "<nil>", gfid); + local->op_ret = -1; + local->op_errno = EINVAL; + goto unwind; + } + + local->op_ret = 0; + if ((local->stbuf.ia_nlink == 1) && (conf && conf->unhashed_sticky_bit)) { + local->stbuf.ia_prot.sticky = 1; + } + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1); + } - local->op_ret = 0; - if ((local->stbuf.ia_nlink == 1) - && (conf && conf->unhashed_sticky_bit)) { - local->stbuf.ia_prot.sticky = 1; - } +unwind: + gf_msg_debug(this->name, 0, + "creation of linkto on hashed subvol:%s, " + "returned with op_ret %d and op_errno %d: %s", + local->hashed_subvol->name, op_ret, op_errno, + uuid_utoa(local->loc.gfid)); - if (local->loc.parent) - local->postparent.ia_ino = local->loc.parent->ino; + if (local->linked == _gf_true) + dht_linkfile_attr_heal(frame, this); -unwind: - WIPE (&local->postparent); + dht_set_fixed_dir_stat(&local->postparent); - DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, local->xattr, - &local->postparent); + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); out: - return ret; + return ret; } +static int +dht_lookup_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int this_call_cnt = 0; + dht_local_t *local = NULL; + const char *path = NULL; -int -dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - int is_linkfile = 0; - int is_dir = 0; - xlator_t *subvol = NULL; - loc_t *loc = NULL; - xlator_t *link_subvol = NULL; - xlator_t *hashed_subvol 
= NULL; - xlator_t *cached_subvol = NULL; - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", frame, out); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); - GF_VALIDATE_OR_GOTO ("dht", this->private, out); - - conf = this->private; - - local = frame->local; - loc = &local->loc; - - prev = cookie; - subvol = prev->this; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - if (op_errno != ENOENT) - local->op_errno = op_errno; - goto unlock; - } - if (uuid_is_null (local->gfid)) - memcpy (local->gfid, buf->ia_gfid, 16); - - is_linkfile = check_is_linkfile (inode, buf, xattr); - is_dir = check_is_dir (inode, buf, xattr); - - if (is_linkfile) { - link_subvol = dht_linkfile_subvol (this, inode, buf, - xattr); - gf_log (this->name, GF_LOG_DEBUG, - "found on %s linkfile %s (-> %s)", - subvol->name, loc->path, - link_subvol ? link_subvol->name : "''"); - goto unlock; - } + local = (dht_local_t *)frame->local; + path = local->loc.path; + FRAME_SU_UNDO(frame, dht_local_t); - if (is_dir) { - local->dir_count++; + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO, + "lookup_unlink returned with " + "op_ret -> %d and op-errno -> %d for %s", + op_ret, op_errno, ((path == NULL) ? 
"null" : path)); - gf_log (this->name, GF_LOG_DEBUG, - "found on %s directory %s", - subvol->name, loc->path); - } else { - local->file_count++; - - if (!local->cached_subvol) { - /* found one file */ - dht_iatt_merge (this, &local->stbuf, buf, - subvol); - local->xattr = dict_ref (xattr); - local->cached_subvol = subvol; - gf_log (this->name, GF_LOG_DEBUG, - "found on %s file %s", - subvol->name, loc->path); - - dht_iatt_merge (this, &local->postparent, - postparent, subvol); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "multiple subvolumes (%s and %s) have " - "file %s", local->cached_subvol->name, - subvol->name, local->loc.path); - } - } - } -unlock: - UNLOCK (&frame->lock); + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + dht_lookup_everywhere_done(frame, this); + } - if (is_linkfile) { - gf_log (this->name, GF_LOG_DEBUG, - "deleting stale linkfile %s on %s", - loc->path, subvol->name); - dht_linkfile_unlink (frame, this, subvol, loc); - } - - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - hashed_subvol = local->hashed_subvol; - cached_subvol = local->cached_subvol; - - if (local->file_count && local->dir_count) { - gf_log (this->name, GF_LOG_ERROR, - "path %s exists as a file on one subvolume " - "and directory on another. 
" - "Please fix it manually", - loc->path); - DHT_STACK_UNWIND (lookup, frame, -1, EIO, NULL, NULL, NULL, - NULL); - return 0; - } + return 0; +} - if (local->dir_count) { - dht_lookup_directory (frame, this, &local->loc); - return 0; - } +static int +dht_lookup_unlink_of_false_linkto_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int this_call_cnt = 0; + dht_local_t *local = NULL; + const char *path = NULL; - if (!cached_subvol) { - DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL, - NULL); - return 0; - } + local = (dht_local_t *)frame->local; + path = local->loc.path; - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "cannot create linkfile file for %s on %s: " - "hashed subvolume cannot be found.", - loc->path, cached_subvol->name); - - local->op_ret = 0; - local->op_errno = 0; - - ret = dht_layout_preset (frame->this, cached_subvol, - local->inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to set layout for subvol %s", - cached_subvol ? cached_subvol->name : - "<nil>"); - local->op_ret = -1; - local->op_errno = EINVAL; - } + FRAME_SU_UNDO(frame, dht_local_t); - if (local->loc.parent) - local->postparent.ia_ino = - local->loc.parent->ino; + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO, + "lookup_unlink returned with " + "op_ret -> %d and op-errno -> %d for %s", + op_ret, op_errno, ((path == NULL) ? "null" : path)); - WIPE (&local->postparent); + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + if ((op_ret == 0) || ((op_errno != EBUSY) && (op_errno != ENOTCONN))) { + dht_lookup_everywhere_done(frame, this); + } else { + /*When dht_lookup_everywhere is performed, one cached + *and one hashed file was found and hashed file does + *not point to the above mentioned cached node. So it + *was considered as stale and an unlink was performed. + *But unlink fails. 
So may be rebalance is in progress. + *now ideally we have two data-files. One obtained during + *lookup_everywhere and one where unlink-failed. So + *at this point in time we cannot decide which one to + *choose because there are chances of first cached + *file is truncated after rebalance and if it is chosen + *as cached node, application will fail. So return EIO.*/ + + if (op_errno == EBUSY) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_UNLINK_FAILED, + "Could not unlink the linkto file as " + "either fd is open and/or linkto xattr " + "is set for %s", + ((path == NULL) ? "null" : path)); + } + DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL); + } + } + + return 0; +} - DHT_STACK_UNWIND (lookup, frame, local->op_ret, - local->op_errno, local->inode, - &local->stbuf, local->xattr, - &local->postparent); - return 0; - } +static int +dht_lookup_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + const char *path = NULL; - gf_log (this->name, GF_LOG_DEBUG, - "linking file %s existing on %s to %s (hash)", - loc->path, cached_subvol->name, - hashed_subvol->name); + /* NOTE: + * If stale file unlink fails either there is an open-fd or is not an + * dht-linkto-file then posix_unlink returns EBUSY, which is overwritten + * to ENOENT + */ - ret = dht_linkfile_create (frame, - dht_lookup_linkfile_create_cbk, - cached_subvol, hashed_subvol, loc); - } + local = frame->local; -out: - return ret; -} + if (local) { + FRAME_SU_UNDO(frame, dht_local_t); + if (local->loc.path) + path = local->loc.path; + } + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO, + "Returned with op_ret %d and " + "op_errno %d for %s", + op_ret, op_errno, ((path == NULL) ? 
"null" : path)); -int -dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc) -{ - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - int i = 0; - int call_cnt = 0; + DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL); - GF_VALIDATE_OR_GOTO ("dht", frame, err); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", this->private, out); - GF_VALIDATE_OR_GOTO ("dht", loc, out); + return 0; +} - conf = this->private; - local = frame->local; +static int +dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_t *dict) +{ + int ret = 0; - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; + ret = dict_set_int32_sizen(dict, DHT_SKIP_NON_LINKTO_UNLINK, 1); - if (!local->inode) - local->inode = inode_ref (loc->inode); + if (ret) + return -1; - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_everywhere_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - loc, local->xattr_req); - } + ret = dict_set_int32_sizen(dict, DHT_SKIP_OPEN_FD_UNLINK, 1); - return 0; -out: - DHT_STACK_UNWIND (lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL); -err: + if (ret) return -1; -} + return 0; +} -int -dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) -{ - call_frame_t *prev = NULL; - dht_local_t *local = NULL; - xlator_t *subvol = NULL; - loc_t *loc = NULL; - dht_conf_t *conf = NULL; - int ret = 0; - - GF_VALIDATE_OR_GOTO ("dht", frame, out); - GF_VALIDATE_OR_GOTO ("dht", this, unwind); - GF_VALIDATE_OR_GOTO ("dht", frame->local, unwind); - GF_VALIDATE_OR_GOTO ("dht", this->private, unwind); - GF_VALIDATE_OR_GOTO ("dht", cookie, unwind); - - prev = cookie; - subvol = prev->this; - conf = this->private; - local = frame->local; - loc = &local->loc; +static int32_t +dht_linkfile_create_lookup_cbk(call_frame_t *frame, 
void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + dht_local_t *local = NULL; + int call_cnt = 0, ret = 0; + xlator_t *subvol = NULL; + uuid_t gfid = { + 0, + }; + char gfid_str[GF_UUID_BUF_SIZE] = {0}; + + subvol = cookie; + local = frame->local; + + if (subvol == local->hashed_subvol) { + if ((op_ret == 0) || (op_errno != ENOENT)) + local->dont_create_linkto = _gf_true; + } else { + if (gf_uuid_is_null(local->gfid)) + gf_uuid_copy(gfid, local->loc.gfid); + else + gf_uuid_copy(gfid, local->gfid); + + if ((op_ret == 0) && gf_uuid_compare(gfid, buf->ia_gfid)) { + gf_uuid_unparse(gfid, gfid_str); + gf_msg_debug(this->name, 0, + "gfid (%s) different on cached subvol " + "(%s) and looked up inode (%s), not " + "creating linkto", + uuid_utoa(buf->ia_gfid), subvol->name, gfid_str); + local->dont_create_linkto = _gf_true; + } else if (op_ret == -1) { + local->dont_create_linkto = _gf_true; + } + } + + call_cnt = dht_frame_return(frame); + if (is_last_call(call_cnt)) { + if (local->dont_create_linkto) + goto no_linkto; + else { + gf_msg_debug(this->name, 0, + "Creating linkto file on %s(hash) to " + "%s on %s (gfid = %s)", + local->hashed_subvol->name, local->loc.path, + local->cached_subvol->name, gfid_str); + + ret = dht_linkfile_create(frame, dht_lookup_linkfile_create_cbk, + this, local->cached_subvol, + local->hashed_subvol, &local->loc); + + if (ret < 0) + goto no_linkto; + } + } + + return 0; + +no_linkto: + gf_msg_debug(this->name, 0, + "skipped linkto creation (path:%s) (gfid:%s) " + "(hashed-subvol:%s) (cached-subvol:%s)", + local->loc.path, gfid_str, local->hashed_subvol->name, + local->cached_subvol->name); + + dht_lookup_linkfile_create_cbk(frame, NULL, this, 0, 0, local->loc.inode, + &local->stbuf, &local->preparent, + &local->postparent, local->xattr); + return 0; +} - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "lookup of %s on %s (following 
linkfile) failed (%s)", - local->loc.path, subvol->name, strerror (op_errno)); - goto err; - } +static int32_t +dht_call_lookup_linkfile_create(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + int i = 0; + xlator_t *subvol = NULL; - if (check_is_dir (inode, stbuf, xattr)) { - gf_log (this->name, GF_LOG_INFO, - "lookup of %s on %s (following linkfile) reached dir", - local->loc.path, subvol->name); - goto err; - } + local = frame->local; + if (gf_uuid_is_null(local->gfid)) + gf_uuid_unparse(local->loc.gfid, gfid); + else + gf_uuid_unparse(local->gfid, gfid); - if (check_is_linkfile (inode, stbuf, xattr)) { - gf_log (this->name, GF_LOG_INFO, - "lookup of %s on %s (following linkfile) reached link", - local->loc.path, subvol->name); - goto err; - } + if (op_ret < 0) { + gf_log(this->name, GF_LOG_WARNING, + "protecting namespace failed, skipping linkto " + "creation (path:%s)(gfid:%s)(hashed-subvol:%s)" + "(cached-subvol:%s)", + local->loc.path, gfid, local->hashed_subvol->name, + local->cached_subvol->name); + goto err; + } - if ((stbuf->ia_nlink == 1) - && (conf && conf->unhashed_sticky_bit)) { - stbuf->ia_prot.sticky = 1; - } - dht_itransform (this, prev->this, stbuf->ia_ino, &stbuf->ia_ino); - if (local->loc.parent) - postparent->ia_ino = local->loc.parent->ino; + local->locked = _gf_true; - ret = dht_layout_preset (this, prev->this, inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "failed to set layout for subvolume %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - } + local->call_cnt = 2; -unwind: - WIPE (postparent); + for (i = 0; i < 2; i++) { + subvol = (subvol == NULL) ? 
local->hashed_subvol : local->cached_subvol; - DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr, - postparent); + STACK_WIND_COOKIE(frame, dht_linkfile_create_lookup_cbk, subvol, subvol, + subvol->fops->lookup, &local->loc, NULL); + } - return 0; + return 0; err: - dht_lookup_everywhere (frame, this, loc); -out: - return 0; + dht_lookup_linkfile_create_cbk(frame, NULL, this, 0, 0, local->loc.inode, + &local->stbuf, &local->preparent, + &local->postparent, local->xattr); + return 0; } +/* Rebalance is performed from cached_node to hashed_node. Initial cached_node + * contains a non-linkto file. After migration it is converted to linkto and + * then unlinked. And at hashed_subvolume, first a linkto file is present, + * then after migration it is converted to a non-linkto file. + * + * Lets assume a file is present on cached subvolume and a new brick is added + * and new brick is the new_hashed subvolume. So fresh lookup on newly added + * hashed subvolume will fail and dht_lookup_everywhere gets called. 
If just + * before sending the dht_lookup_everywhere request rebalance is in progress, + * + * from cached subvolume it may see: Nonlinkto or linkto or No file + * from hashed subvolume it may see: No file or linkto file or non-linkto file + * + * So this boils down to 9 cases: + * at cached_subvol at hashed_subvol + * ---------------- ----------------- + * + *a) No file No file + * [request reached after [Request reached before + * migration] Migration] + * + *b) No file Linkto File + * + *c) No file Non-Linkto File + * + *d) Linkto No-File + * + *e) Linkto Linkto + * + *f) Linkto Non-Linkto + * + *g) NonLinkto No-File + * + *h) NonLinkto Linkto + * + *i) NonLinkto NonLinkto + * + * dht_lookup_everywhere_done takes decision based on any of the above case + */ -int -dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc) +static int +dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this) { - int call_cnt = 0; - int i = 0; - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - - GF_VALIDATE_OR_GOTO ("dht", frame, out); - GF_VALIDATE_OR_GOTO ("dht", this, unwind); - GF_VALIDATE_OR_GOTO ("dht", frame->local, unwind); - GF_VALIDATE_OR_GOTO ("dht", this->private, unwind); - GF_VALIDATE_OR_GOTO ("dht", loc, unwind); + int ret = 0; + dht_local_t *local = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + dht_layout_t *layout = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + gf_boolean_t found_non_linkto_on_hashed = _gf_false; + + local = frame->local; + hashed_subvol = local->hashed_subvol; + cached_subvol = local->cached_subvol; + + gf_uuid_unparse(local->loc.gfid, gfid); + + if (local->file_count && local->dir_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_FILE_TYPE_MISMATCH, + "path %s (gfid = %s)exists as a file on one " + "subvolume and directory on another. 
" + "Please fix it manually", + local->loc.path, gfid); + DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL); + return 0; + } + if (local->op_ret && local->gfid_missing) { + if (gf_uuid_is_null(local->gfid_req)) { + DHT_STACK_UNWIND(lookup, frame, -1, ENODATA, NULL, NULL, NULL, + NULL); + return 0; + } + /* A hack */ + dht_lookup_directory(frame, this, &local->loc); + return 0; + } + + if (local->dir_count) { + dht_lookup_directory(frame, this, &local->loc); + return 0; + } + + gf_msg_debug(this->name, 0, + "STATUS: hashed_subvol %s " + "cached_subvol %s", + (hashed_subvol == NULL) ? "null" : hashed_subvol->name, + (cached_subvol == NULL) ? "null" : cached_subvol->name); + + if (!cached_subvol) { + if (local->skip_unlink.handle_valid_link && hashed_subvol) { + /*Purpose of "DHT_SKIP_NON_LINKTO_UNLINK": + * If this lookup is performed by rebalance and this + * rebalance process detected hashed file and by + * the time it sends the lookup request to cached node, + * file got migrated and now at initial hashed_node, + * final migrated file is present. With current logic, + * because this process fails to find the cached_node, + * it will unlink the file at initial hashed_node. + * + * So we avoid this by setting key, and checking at the + * posix_unlink that unlink the file only if file is a + * linkto file and not a migrated_file. + */ + + ret = dht_fill_dict_to_avoid_unlink_of_migrating_file( + local->xattr_req); + + if (ret) { + /* If for some reason, setting key in the dict + * fails, return with ENOENT, as with respect to + * this process, it detected only a stale link + * file. + * + * Next lookup will delete it. + * + * Performing deletion of stale link file when + * setting key in dict fails, may cause the data + * loss because of the above mentioned race. 
+ */ - conf = this->private; - local = frame->local; + DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, + NULL); + } else { + local->skip_unlink.handle_valid_link = _gf_false; + + gf_msg_debug(this->name, 0, + "No Cached was found and " + "unlink on hashed was skipped" + " so performing now: %s", + local->loc.path); + FRAME_SU_DO(frame, dht_local_t); + STACK_WIND(frame, dht_lookup_unlink_stale_linkto_cbk, + hashed_subvol, hashed_subvol->fops->unlink, + &local->loc, 0, local->xattr_req); + } - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; + } else { + gf_msg_debug(this->name, 0, + "There was no cached file and " + "unlink on hashed is not skipped %s", + local->loc.path); - local->layout = dht_layout_new (this, conf->subvolume_cnt); - if (!local->layout) { - goto unwind; + DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL); } + return 0; + } - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - &local->loc, local->xattr_req); + /* At the time of dht_lookup, no file was found on hashed and that is + * why dht_lookup_everywhere is called, but by the time + * dht_lookup_everywhere + * reached to server, file might have already migrated. In that case we + * will find a migrated file at the hashed_node. In this case store the + * layout in context and return successfully. 
+ */ + + if (hashed_subvol || local->need_lookup_everywhere) { + if (local->need_lookup_everywhere) { + found_non_linkto_on_hashed = _gf_true; + + } else if ((local->file_count == 1) && + (hashed_subvol == cached_subvol)) { + gf_msg_debug(this->name, 0, + "found cached file on hashed subvolume " + "so store in context and return for %s", + local->loc.path); + + found_non_linkto_on_hashed = _gf_true; } - return 0; -unwind: - DHT_STACK_UNWIND (lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL); -out: - return 0; -} + if (found_non_linkto_on_hashed) + goto preset_layout; + } + if (hashed_subvol) { + if (local->skip_unlink.handle_valid_link == _gf_true) { + if (cached_subvol == local->skip_unlink.hash_links_to) { + if (gf_uuid_compare(local->skip_unlink.cached_gfid, + local->skip_unlink.hashed_gfid)) { + /*GFID different, return error*/ + DHT_STACK_UNWIND(lookup, frame, -1, ESTALE, NULL, NULL, + NULL, NULL); -int -dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) -{ - char is_linkfile = 0; - char is_dir = 0; - xlator_t *subvol = NULL; - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - loc_t *loc = NULL; - call_frame_t *prev = NULL; - int ret = 0; - uint64_t tmp_layout = 0; - dht_layout_t *parent_layout = NULL; - - GF_VALIDATE_OR_GOTO ("dht", frame, err); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); - GF_VALIDATE_OR_GOTO ("dht", this->private, out); - - conf = this->private; - - prev = cookie; - local = frame->local; - loc = &local->loc; + return 0; + } - /* This is required for handling stale linkfile deletion, - * or any more call which happens from this 'loc'. 
- */ - if (uuid_is_null (local->gfid) && !op_ret) - memcpy (local->gfid, stbuf->ia_gfid, 16); - - if (ENTRY_MISSING (op_ret, op_errno)) { - gf_log (this->name, GF_LOG_TRACE, "Entry %s missing on subvol" - " %s", loc->path, prev->this->name); - if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_ON) { - local->op_errno = ENOENT; - dht_lookup_everywhere (frame, this, loc); - return 0; + ret = dht_layout_preset(this, cached_subvol, local->loc.inode); + if (ret) { + gf_msg(this->name, GF_LOG_INFO, 0, + DHT_MSG_LAYOUT_PRESET_FAILED, + "Could not set pre-set layout " + "for subvolume %s", + cached_subvol->name); } - if ((conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) && - (loc->parent)) { - ret = inode_ctx_get (loc->parent, this, &tmp_layout); - parent_layout = (dht_layout_t *)(long)tmp_layout; - if (parent_layout->search_unhashed) { - local->op_errno = ENOENT; - dht_lookup_everywhere (frame, this, loc); - return 0; - } + + local->op_ret = (ret == 0) ? ret : -1; + local->op_errno = (ret == 0) ? ret : EINVAL; + + /* Presence of local->cached_subvol validates + * that lookup from cached node is successful + */ + + if (!local->op_ret && local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); } - } - if (op_ret == 0) { - is_dir = check_is_dir (inode, stbuf, xattr); - if (is_dir) { - local->inode = inode_ref (inode); - local->xattr = dict_ref (xattr); + gf_msg_debug(this->name, 0, + "Skipped unlinking linkto file " + "on the hashed subvolume. " + "Returning success as it is a " + "valid linkto file. Path:%s", + local->loc.path); + + goto unwind_hashed_and_cached; + } else { + local->skip_unlink.handle_valid_link = _gf_false; + + gf_msg_debug(this->name, 0, + "Linkto file found on hashed " + "subvol " + "and data file found on cached " + "subvolume. But linkto points to " + "different cached subvolume (%s) " + "path %s", + (local->skip_unlink.hash_links_to + ? 
local->skip_unlink.hash_links_to->name + : " <nil>"), + local->loc.path); + + if (local->skip_unlink.opend_fd_count == 0) { + ret = dht_fill_dict_to_avoid_unlink_of_migrating_file( + local->xattr_req); + + if (ret) { + DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, + NULL, NULL); + } else { + local->call_cnt = 1; + FRAME_SU_DO(frame, dht_local_t); + STACK_WIND(frame, dht_lookup_unlink_of_false_linkto_cbk, + hashed_subvol, hashed_subvol->fops->unlink, + &local->loc, 0, local->xattr_req); + } + + return 0; } + } } + } + +preset_layout: - if (is_dir || (op_ret == -1 && op_errno == ENOTCONN)) { - dht_lookup_directory (frame, this, &local->loc); + if (found_non_linkto_on_hashed) { + if (local->need_lookup_everywhere) { + if (gf_uuid_compare(local->gfid, local->inode->gfid)) { + /* GFID different, return error */ + DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, + NULL); return 0; + } } - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, "Lookup of %s for subvolume" - " %s failed with error %s", loc->path, prev->this->name, - strerror (op_errno)); - goto out; + local->op_ret = 0; + local->op_errno = 0; + layout = dht_layout_for_subvol(this, cached_subvol); + if (!layout) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "%s: no pre-set layout for subvolume %s," + " gfid = %s", + local->loc.path, + (cached_subvol ? 
cached_subvol->name : "<nil>"), gfid); } - is_linkfile = check_is_linkfile (inode, stbuf, xattr); - is_dir = check_is_dir (inode, stbuf, xattr); - - if (!is_dir && !is_linkfile) { - /* non-directory and not a linkfile */ - - dht_itransform (this, prev->this, stbuf->ia_ino, - &stbuf->ia_ino); - if (loc->parent) - postparent->ia_ino = loc->parent->ino; - - ret = dht_layout_preset (this, prev->this, inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "could not set pre-set layout for subvolume %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto out; - } - goto out; + ret = dht_layout_set(this, local->inode, layout); + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "%s: failed to set layout for subvol %s, " + "gfid = %s", + local->loc.path, + (cached_subvol ? cached_subvol->name : "<nil>"), gfid); } - if (is_linkfile) { - subvol = dht_linkfile_subvol (this, inode, stbuf, xattr); - - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "linkfile not having link subvolume. path=%s", - loc->path); - dht_lookup_everywhere (frame, this, loc); - return 0; - } - - STACK_WIND (frame, dht_lookup_linkfile_cbk, - subvol, subvol->fops->lookup, - &local->loc, local->xattr_req); + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); } + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); return 0; + } -out: - /* - * FIXME: postparent->ia_size and postparent->st_blocks do not have - * correct values. since, postparent corresponds to a directory these - * two members should have values equal to sum of corresponding values - * from each of the subvolume. See dht_iatt_merge for reference. 
- */ + if (!hashed_subvol) { + gf_msg_debug(this->name, 0, + "Cannot create linkfile for %s on %s: " + "hashed subvolume cannot be found, gfid = %s.", + local->loc.path, cached_subvol->name, gfid); - WIPE (postparent); + local->op_ret = 0; + local->op_errno = 0; - DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr, - postparent); -err: - return 0; + ret = dht_layout_preset(frame->this, cached_subvol, local->inode); + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED, + "Failed to set layout for subvol %s" + ", gfid = %s", + cached_subvol ? cached_subvol->name : "<nil>", gfid); + local->op_ret = -1; + local->op_errno = EINVAL; + } + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + return 0; + } + + if (frame->root->op != GF_FOP_RENAME) { + local->current = &local->lock[0]; + ret = dht_protect_namespace(frame, &local->loc, hashed_subvol, + &local->current->ns, + dht_call_lookup_linkfile_create); + } else { + gf_msg_debug(this->name, 0, + "Creating linkto file on %s(hash) to %s on %s " + "(gfid = %s)", + hashed_subvol->name, local->loc.path, cached_subvol->name, + gfid); + + ret = dht_linkfile_create(frame, dht_lookup_linkfile_create_cbk, this, + cached_subvol, hashed_subvol, &local->loc); + } + + return ret; + +unwind_hashed_and_cached: + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + return 0; } +static int +dht_lookup_everywhere_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t 
*xattr, + struct iatt *postparent) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + int is_linkfile = 0; + int is_dir = 0; + loc_t *loc = NULL; + xlator_t *link_subvol = NULL; + int ret = -1; + int32_t fd_count = 0; + dht_conf_t *conf = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + dict_t *dict_req = {0}; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + GF_VALIDATE_OR_GOTO("dht", this->private, out); + + local = frame->local; + loc = &local->loc; + conf = this->private; + + prev = cookie; + + gf_msg_debug(this->name, 0, + "returned with op_ret %d and op_errno %d (%s) " + "from subvol %s", + op_ret, op_errno, loc->path, prev->name); + + LOCK(&frame->lock); + { + if (op_ret == -1) { + if (op_errno != ENOENT) + local->op_errno = op_errno; + if (op_errno == ENODATA) + local->gfid_missing = _gf_true; + goto unlock; + } -int -dht_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) -{ - xlator_t *subvol = NULL; - xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - int op_errno = -1; - dht_layout_t *layout = NULL; - int i = 0; - int call_cnt = 0; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - conf = this->private; - if (!conf) - goto err; + if (gf_uuid_is_null(local->gfid)) + gf_uuid_copy(local->gfid, buf->ia_gfid); - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; - } - if (!dht_filter_loc_subvol_key (this, loc, &local->loc, - &hashed_subvol)) { - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "copying location failed for path=%s", - loc->path); - goto err; - } + 
gf_uuid_unparse(local->gfid, gfid); + + if (gf_uuid_compare(local->gfid, buf->ia_gfid)) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH, + "%s: gfid differs on subvolume %s," + " gfid local = %s, gfid node = %s", + loc->path, prev->name, gfid, uuid_utoa(buf->ia_gfid)); } - if (xattr_req) { - local->xattr_req = dict_ref (xattr_req); - } else { - local->xattr_req = dict_new (); + is_linkfile = check_is_linkfile(inode, buf, xattr, + conf->link_xattr_name); + + if (is_linkfile) { + link_subvol = dht_linkfile_subvol(this, inode, buf, xattr); + gf_msg_debug(this->name, 0, "found on %s linkfile %s (-> %s)", + prev->name, loc->path, + link_subvol ? link_subvol->name : "''"); + goto unlock; } - if (!hashed_subvol) - hashed_subvol = dht_subvol_get_hashed (this, loc); - cached_subvol = dht_subvol_get_cached (this, loc->inode); + is_dir = check_is_dir(inode, buf, xattr); - local->cached_subvol = cached_subvol; - local->hashed_subvol = hashed_subvol; + /* non linkfile GFID takes precedence but don't overwrite + gfid if we have already found a cached file*/ + if (!local->cached_subvol) + gf_uuid_copy(local->gfid, buf->ia_gfid); - if (is_revalidate (loc)) { - local->layout = layout = dht_layout_get (this, loc->inode); + if (is_dir) { + local->dir_count++; - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "revalidate without cache. 
path=%s", - loc->path); - op_errno = EINVAL; - goto err; - } + gf_msg_debug(this->name, 0, "found on %s directory %s", prev->name, + loc->path); + } else { + local->file_count++; - if (layout->gen && (layout->gen < conf->gen)) { - gf_log (this->name, GF_LOG_TRACE, - "incomplete layout failure for path=%s", - loc->path); + gf_msg_debug(this->name, 0, "found cached file on %s for %s", + prev->name, loc->path); - dht_layout_unref (this, local->layout); - local->layout = NULL; - goto do_fresh_lookup; - } + if (!local->cached_subvol) { + /* found one file */ + dht_iatt_merge(this, &local->stbuf, buf); - local->inode = inode_ref (loc->inode); - local->ia_ino = loc->inode->ino; + local->xattr = dict_ref(xattr); + local->cached_subvol = prev; - local->call_cnt = 1; - call_cnt = local->call_cnt; + gf_msg_debug(this->name, 0, + "storing cached on %s file" + " %s", + prev->name, loc->path); - /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, - * revalidates directly go to the cached-subvolume. - */ - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + dht_iatt_merge(this, &local->postparent, postparent); - subvol = local->cached_subvol; + gf_uuid_copy(local->skip_unlink.cached_gfid, buf->ia_gfid); + } else { + /* This is where we need 'rename' both entries logic */ + gf_msg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_FILE_ON_MULT_SUBVOL, + "multiple subvolumes (%s and %s) have " + "file %s (preferably rename the file " + "in the backend,and do a fresh lookup)", + local->cached_subvol->name, prev->name, local->loc.path); + } + } + } +unlock: + UNLOCK(&frame->lock); + + if (is_linkfile) { + ret = dict_get_int32(xattr, GLUSTERFS_OPEN_FD_COUNT, &fd_count); + + /* Any linkto file found on the non-hashed subvolume should + * be unlinked (performed in the "else if" block below) + * + * But if a linkto file is found on hashed subvolume, it may be + * pointing to valid cached node. 
So unlinking of linkto + * file on hashed subvolume is skipped and inside + * dht_lookup_everywhere_done, checks are performed. If this + * linkto file is found as stale linkto file, it is deleted + * otherwise unlink is skipped. + */ - STACK_WIND (frame, dht_revalidate_cbk, - subvol, subvol->fops->lookup, - &local->loc, local->xattr_req); + if (local->hashed_subvol && local->hashed_subvol == prev) { + local->skip_unlink.handle_valid_link = _gf_true; + local->skip_unlink.opend_fd_count = fd_count; + local->skip_unlink.hash_links_to = link_subvol; + gf_uuid_copy(local->skip_unlink.hashed_gfid, buf->ia_gfid); + + gf_msg_debug(this->name, 0, + "Found" + " one linkto file on hashed subvol %s " + "for %s: Skipping unlinking till " + "everywhere_done", + prev->name, loc->path); + + } else if (!ret && (fd_count == 0)) { + dict_req = dict_new(); + + ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_req); + + if (ret) { + /* Skip unlinking for dict_failure + *File is found as a linkto file on non-hashed, + *subvolume. In the current implementation, + *finding a linkto-file on non-hashed does not + *always implies that it is stale. So deletion + *of file should be done only when both fd is + *closed and linkto-xattr is set. In case of + *dict_set failure, avoid skipping of file. + *NOTE: dht_frame_return should get called for + * this block. 
+ */ - } else { - do_fresh_lookup: - /* TODO: remove the hard-coding */ - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); - - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht.linkto", 256); - - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s, " - "checking on all the subvols to see if " - "it is a directory", loc->path); - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; - - local->layout = dht_layout_new (this, - conf->subvolume_cnt); - if (!local->layout) { - op_errno = ENOMEM; - goto err; - } + dict_unref(dict_req); + + } else { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "attempting deletion of stale linkfile " + "%s on %s (hashed subvol is %s)", + loc->path, prev->name, + (local->hashed_subvol ? local->hashed_subvol->name + : "<null>")); + /* * + * These stale files may be created using root + * user. Hence deletion will work only with + * root. + */ + FRAME_SU_DO(frame, dht_local_t); + STACK_WIND(frame, dht_lookup_unlink_cbk, prev, + prev->fops->unlink, loc, 0, dict_req); - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - &local->loc, local->xattr_req); - } - return 0; - } + dict_unref(dict_req); - STACK_WIND (frame, dht_lookup_cbk, - hashed_subvol, hashed_subvol->fops->lookup, - loc, local->xattr_req); + return 0; + } } + } - return 0; + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + dht_lookup_everywhere_done(frame, this); + } -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); - return 0; +out: + return ret; } - int -dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *prebuf, - struct iatt *postbuf) +dht_lookup_everywhere(call_frame_t *frame, xlator_t *this, loc_t *loc) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int i = 0; + int call_cnt = 0; - GF_VALIDATE_OR_GOTO ("dht", frame, err); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", this->private, out); + GF_VALIDATE_OR_GOTO("dht", loc, out); - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - local->op_ret = -1; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + conf = this->private; + local = frame->local; - dht_iatt_merge (this, &local->prebuf, prebuf, prev->this); - dht_iatt_merge (this, &local->stbuf, postbuf, prev->this); + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; - if (local->inode) { - local->stbuf.ia_ino = local->inode->ino; - local->prebuf.ia_ino = local->inode->ino; - } + if (!local->inode) + local->inode = inode_ref(loc->inode); - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); + gf_msg_debug(this->name, 0, "winding lookup call to %d subvols", call_cnt); + + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_lookup_everywhere_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, loc, + local->xattr_req); + } + + return 0; out: - this_call_cnt = dht_frame_return 
(frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (truncate, frame, local->op_ret, local->op_errno, - &local->prebuf, &local->stbuf); + DHT_STACK_UNWIND(lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL); err: - return 0; + return -1; } - - int -dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *stbuf) +dht_lookup_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; + xlator_t *prev = NULL; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + loc_t *loc = NULL; + dht_conf_t *conf = NULL; + int ret = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, unwind); + GF_VALIDATE_OR_GOTO("dht", frame->local, unwind); + GF_VALIDATE_OR_GOTO("dht", this->private, unwind); + GF_VALIDATE_OR_GOTO("dht", cookie, unwind); + + prev = cookie; + subvol = prev; + conf = this->private; + local = frame->local; + loc = &local->loc; + + gf_uuid_unparse(loc->gfid, gfid); + + if (op_ret == -1) { + gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_LINK_FILE_LOOKUP_INFO, + "Lookup of %s on %s (following linkfile) failed " + ",gfid = %s", + local->loc.path, subvol->name, gfid); + + /* If cached subvol returned ENOTCONN, do not do + lookup_everywhere. We need to make sure linkfile does not get + removed, which can take away the namespace, and subvol is + anyways down. 
*/ + + local->cached_subvol = NULL; + if (op_errno != ENOTCONN) + goto err; + else + goto unwind; + } + + if (check_is_dir(inode, stbuf, xattr)) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LINK_FILE_LOOKUP_INFO, + "Lookup of %s on %s (following linkfile) reached dir," + " gfid = %s", + local->loc.path, subvol->name, gfid); + goto err; + } + + if (check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name)) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LINK_FILE_LOOKUP_INFO, + "lookup of %s on %s (following linkfile) reached link," + "gfid = %s", + local->loc.path, subvol->name, gfid); + goto err; + } + + if (gf_uuid_compare(local->gfid, stbuf->ia_gfid)) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH, + "%s: gfid different on data file on %s," + " gfid local = %s, gfid node = %s ", + local->loc.path, subvol->name, gfid, uuid_utoa(stbuf->ia_gfid)); + goto err; + } + + if ((stbuf->ia_nlink == 1) && (conf && conf->unhashed_sticky_bit)) { + stbuf->ia_prot.sticky = 1; + } + + ret = dht_layout_preset(this, prev, inode); + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED, + "Failed to set layout for subvolume %s," + "gfid = %s", + prev->name, gfid); + op_ret = -1; + op_errno = EINVAL; + } + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1); + } - GF_VALIDATE_OR_GOTO ("dht", frame, err); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); +unwind: + DHT_STRIP_PHASE1_FLAGS(stbuf); + dht_set_fixed_dir_stat(postparent); + DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr, + postparent); - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + return 0; - dht_iatt_merge (this, 
&local->stbuf, stbuf, prev->this); +err: + dht_lookup_everywhere(frame, this, loc); +out: + return 0; +} - if (local->inode) - local->stbuf.ia_ino = local->inode->ino; - local->op_ret = 0; +/* Code to get hashed subvol based on inode and loc + First it check if loc->parent and loc->path exist then it get + hashed subvol based on loc. +*/ + +static gf_boolean_t +dht_should_lookup_everywhere(xlator_t *this, dht_conf_t *conf, loc_t *loc) +{ + dht_layout_t *parent_layout = NULL; + int ret = 0; + gf_boolean_t lookup_everywhere = _gf_true; + + /* lookup-optimize supersedes lookup-unhashed settings. + * If it is set, do not process search_unhashed + * If lookup-optimize if enabled, lookup everywhere if: + * - this is the rebalance daemon. + * - loc->parent is unavailable. + * - parent_layout is unavailable + * - parent_layout->commit_hash != conf->vol_commit_hash + */ + + if (conf->lookup_optimize) { + if (!conf->defrag && loc->parent) { + ret = dht_inode_ctx_layout_get(loc->parent, this, &parent_layout); + if (!ret && parent_layout && + (parent_layout->commit_hash == conf->vol_commit_hash)) { + lookup_everywhere = _gf_false; + } + } + goto out; + } else { + if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) { + if (loc->parent) { + ret = dht_inode_ctx_layout_get(loc->parent, this, + &parent_layout); + if (ret || !parent_layout || + (!parent_layout->search_unhashed)) { + lookup_everywhere = _gf_false; + } + } else { + lookup_everywhere = _gf_false; + } + + goto out; } -unlock: - UNLOCK (&frame->lock); + } out: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno, - &local->stbuf); -err: - return 0; + return lookup_everywhere; } - int -dht_stat (call_frame_t *frame, xlator_t *this, - loc_t *loc) +dht_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - 
xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int i = 0; - + char is_linkfile = 0; + char is_dir = 0; + xlator_t *subvol = NULL; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; + xlator_t *prev = NULL; + int ret = 0; + uint32_t vol_commit_hash = 0; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + GF_VALIDATE_OR_GOTO("dht", this->private, out); + + conf = this->private; + + prev = cookie; + local = frame->local; + loc = &local->loc; + + gf_msg_debug(this->name, op_errno, + "%s: fresh_lookup on %s returned with op_ret %d", loc->path, + prev->name, op_ret); + + if (op_ret == -1) { + if (ENTRY_MISSING(op_ret, op_errno)) { + if (1 == conf->subvolume_cnt) { + /* No need to lookup again */ + goto out; + } - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + gf_msg_debug(this->name, 0, "Entry %s missing on subvol %s", + loc->path, prev->name); + if (dht_should_lookup_everywhere(this, conf, loc)) { + local->op_errno = ENOENT; + dht_lookup_everywhere(frame, this, loc); + return 0; + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; + } else { + /* posix returns ENODATA if the gfid is not set but the client and + * server protocol layers do not send the stbuf. We need to + * heal this so check if this is a directory on the other subvols. 
+ */ + if ((op_errno == ENOTCONN) || (op_errno == ENODATA)) { + dht_lookup_directory(frame, this, &local->loc); + return 0; + } + } + gf_msg_debug(this->name, op_errno, "%s: Lookup on subvolume %s failed", + loc->path, prev->name); + goto out; + } + + /* Lookup succeeded - op_ret = 0 */ + + /* This is required for handling stale linkfile deletion, + * or any more call which happens from this 'loc'. + */ + if (gf_uuid_is_null(local->gfid)) { + /*This is set from the first successful response*/ + memcpy(local->gfid, stbuf->ia_gfid, 16); + } + + if (!conf->vch_forced) { + /* Update the commit hash in conf if it is found */ + ret = dict_get_uint32(xattr, conf->commithash_xattr_name, + &vol_commit_hash); + if (ret == 0) { + conf->vol_commit_hash = vol_commit_hash; } + } - local->layout = layout = dht_layout_get (this, loc->inode); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + is_dir = check_is_dir(inode, stbuf, xattr); + if (is_dir) { + /* A directory is present on all subvols, send the lookup to + * all subvols now */ + local->inode = inode_ref(inode); + local->xattr = dict_ref(xattr); + dht_lookup_directory(frame, this, &local->loc); + return 0; + } - local->inode = inode_ref (loc->inode); - local->call_cnt = layout->cnt; + is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name); - for (i = 0; i < layout->cnt; i++) { - subvol = layout->list[i].xlator; + if (!is_linkfile) { + /* non-directory and not a linkto file. This is a data file + * Update the layout to point to the cached subvol + */ - STACK_WIND (frame, dht_attr_cbk, - subvol, subvol->fops->stat, - loc); + ret = dht_layout_preset(this, prev, inode); + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED, + "%s: could not set pre-set layout for subvolume %s", + loc->path, prev->name); + op_ret = -1; + op_errno = EINVAL; + goto out; } + goto out; + } + /* This is a linkto file. 
Get the value of the target subvol from the + * linkto xattr and lookup there to see if the file exists + */ + subvol = dht_linkfile_subvol(this, inode, stbuf, xattr); + if (!subvol) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "%s: No link subvol for linkto", loc->path); + dht_lookup_everywhere(frame, this, loc); return 0; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL); + gf_msg_debug(this->name, 0, "%s: Calling lookup on linkto target %s", + loc->path, subvol->name); - return 0; -} + STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol, + subvol->fops->lookup, &local->loc, local->xattr_req); + return 0; -int -dht_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd) +out: + /* + * FIXME: postparent->ia_size and postparent->st_blocks do not have + * correct values. since, postparent corresponds to a directory these + * two members should have values equal to sum of corresponding values + * from each of the subvolume. See dht_iatt_merge for reference. + */ + + if (!op_ret && local && local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS(stbuf); + dht_set_fixed_dir_stat(postparent); + DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr, + postparent); +err: + return 0; +} + +/* For directories, check if acl xattrs have been requested (by the acl + * xlator), if not, request for them. 
These xattrs are needed for dht dir + * self-heal to perform proper self-healing of dirs + */ +static void +dht_check_and_set_acl_xattr_req(xlator_t *this, dict_t *xattr_req) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int i = 0; + int ret = 0; + GF_ASSERT(xattr_req); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + if (!dict_get(xattr_req, POSIX_ACL_ACCESS_XATTR)) { + ret = dict_set_int8(xattr_req, POSIX_ACL_ACCESS_XATTR, 0); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value:key = %s", + POSIX_ACL_ACCESS_XATTR); + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; - } + if (!dict_get(xattr_req, POSIX_ACL_DEFAULT_XATTR)) { + ret = dict_set_int8(xattr_req, POSIX_ACL_DEFAULT_XATTR, 0); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value:key = %s", + POSIX_ACL_DEFAULT_XATTR); + } - local->layout = layout = dht_layout_get (this, fd->inode); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + return; +} + +/* for directories, we need the following info: + * the layout : trusted.glusterfs.dht + * the mds information : trusted.glusterfs.dht.mds + * the acl info: See above + */ +static int +dht_set_dir_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + int ret = -EINVAL; + dht_conf_t *conf = NULL; + + conf = this->private; + if (!conf) { + goto err; + } + + if (!xattr_req) { + goto err; + } + + /* Xattr to get the layout for a directory + */ + ret = dict_set_uint32(xattr_req, conf->xattr_name, 4 * 4); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value:key = %s for " + "path %s", + conf->xattr_name, loc->path); + goto err; + } + + /*Non-fatal failure */ + ret = 
dict_set_uint32(xattr_req, conf->mds_xattr_key, 4); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value:key = %s for " + "path %s", + conf->mds_xattr_key, loc->path); + } + + dht_check_and_set_acl_xattr_req(this, xattr_req); + ret = 0; +err: + return ret; +} - local->inode = inode_ref (fd->inode); - local->call_cnt = layout->cnt;; +/* If the hashed subvol is present, send the lookup to only that subvol first. + * If no hashed subvol, send a lookup to all subvols and proceed based on the + * responses. + */ +static int +dht_do_fresh_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + int ret = -1; + dht_conf_t *conf = NULL; + xlator_t *hashed_subvol = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int call_cnt = 0; + int i = 0; + + conf = this->private; + if (!conf) { + op_errno = EINVAL; + goto err; + } + + local = frame->local; + if (!local) { + op_errno = EINVAL; + goto err; + } + + /* Since we don't know whether this is a file or a directory, + * request all xattrs*/ + ret = dht_set_file_xattr_req(this, loc, local->xattr_req); + if (ret) { + op_errno = -ret; + goto err; + } + + ret = dht_set_dir_xattr_req(this, loc, local->xattr_req); + if (ret) { + op_errno = -ret; + goto err; + } + + /* Fuse sets a random value in gfid-req. If the gfid is missing + * on one or more subvols, posix will set the gfid to this value, + * causing GFID mismatches for directories. Remove the value fuse + * has sent before sending the lookup. 
+ */ + ret = dict_get_gfuuid(local->xattr_req, "gfid-req", &local->gfid_req); + if (ret) { + gf_msg_debug(this->name, 0, "%s: No gfid-req available", loc->path); + } else { + dict_del(local->xattr_req, "gfid-req"); + } + /* This should have been set in dht_lookup */ + hashed_subvol = local->hashed_subvol; + + if (!hashed_subvol) { + gf_msg_debug(this->name, 0, + "%s: no subvolume in layout for path, " + "checking on all the subvols to see if " + "it is a directory", + loc->path); - for (i = 0; i < layout->cnt; i++) { - subvol = layout->list[i].xlator; - STACK_WIND (frame, dht_attr_cbk, - subvol, subvol->fops->fstat, - fd); + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + /* Allocate a layout. This will be populated and saved in + * the dht inode_ctx on successful lookup + */ + local->layout = dht_layout_new(this, conf->subvolume_cnt); + if (!local->layout) { + op_errno = ENOMEM; + goto err; } + gf_msg_debug(this->name, 0, + "%s: Found null hashed subvol. Calling lookup" + " on all nodes.", + loc->path); + + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, + local->xattr_req); + } return 0; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fstat, frame, -1, op_errno, NULL); + /* if the hashed_subvol is non-null, send the lookup there first so + * as to see whether we have a file or a directory */ + gf_msg_debug(this->name, 0, "%s: Calling fresh lookup on %s", loc->path, + hashed_subvol->name); - return 0; + STACK_WIND_COOKIE(frame, dht_lookup_cbk, hashed_subvol, hashed_subvol, + hashed_subvol->fops->lookup, loc, local->xattr_req); + return 0; +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; } - -int -dht_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset) +static int +dht_do_revalidate(call_frame_t *frame, xlator_t *this, loc_t *loc) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - + xlator_t *subvol = NULL; + xlator_t *mds_subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + int gen = 0; + + conf = this->private; + if (!conf) { + op_errno = EINVAL; + goto err; + } + + local = frame->local; + if (!local) { + op_errno = EINVAL; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_msg_debug(this->name, 0, + "path = %s. No layout found in the inode ctx.", loc->path); + op_errno = EINVAL; + goto err; + } + + /* Generation number has changed. This layout may be stale. */ + if (layout->gen && (layout->gen < conf->gen)) { + gen = layout->gen; + dht_layout_unref(this, local->layout); + local->layout = NULL; + local->cached_subvol = NULL; + + gf_msg_debug(this->name, 0, + "path = %s. In memory layout may be stale." + "(layout->gen (%d) is less than " + "conf->gen (%d)). Calling fresh lookup.", + loc->path, gen, conf->gen); + + dht_do_fresh_lookup(frame, this, loc); + return 0; + } + + local->inode = inode_ref(loc->inode); + + /* Since we don't know whether this has changed, + * request all xattrs*/ + ret = dht_set_file_xattr_req(this, loc, local->xattr_req); + if (ret) { + op_errno = -ret; + goto err; + } + + ret = dht_set_dir_xattr_req(this, loc, local->xattr_req); + if (ret) { + op_errno = -ret; + goto err; + } + + if (IA_ISDIR(local->inode->ia_type)) { + ret = dht_inode_ctx_mdsvol_get(local->inode, this, &mds_subvol); + if (ret || !mds_subvol) { + gf_msg_debug(this->name, 0, "path = %s. 
No mds subvol in inode ctx", + local->loc.path); + } + local->mds_subvol = mds_subvol; + local->call_cnt = conf->subvolume_cnt; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + /* local->call_cnt will change as responses are processed. Always use a + * local copy to loop through the STACK_WIND calls + */ - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + call_cnt = local->call_cnt; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_revalidate_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, loc, + local->xattr_req); } + return 0; + } - local->inode = inode_ref (loc->inode); - local->call_cnt = 1; + /* If not a dir, this should be 1 */ + local->call_cnt = layout->cnt; + call_cnt = local->call_cnt; - STACK_WIND (frame, dht_truncate_cbk, - subvol, subvol->fops->truncate, - loc, offset); + for (i = 0; i < call_cnt; i++) { + subvol = layout->list[i].xlator; - return 0; + gf_msg_debug(this->name, 0, + "path = %s. Calling " + "revalidate lookup on %s", + loc->path, subvol->name); + STACK_WIND_COOKIE(frame, dht_revalidate_cbk, subvol, subvol, + subvol->fops->lookup, &local->loc, local->xattr_req); + } + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL); - - return 0; + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; } +/* Depending on the input, decide if this is a: + * fresh-lookup: loc->name is provided but no dht inode ctx + * revalidation: loc->name is provided, dht inode ctx is present + * discover: gfid based nameless lookup. + */ int -dht_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset) +dht_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + xlator_t *hashed_subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + loc_t new_loc = { + 0, + }; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + + conf = this->private; + if (!conf) + goto err; + + local = dht_local_init(frame, loc, NULL, GF_FOP_LOOKUP); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + ret = dht_filter_loc_subvol_key(this, loc, &new_loc, &hashed_subvol); + if (ret) { + loc_wipe(&local->loc); + ret = loc_dup(&new_loc, &local->loc); + + /* we no longer need 'new_loc' entries */ + loc_wipe(&new_loc); + + /* check if loc_dup() is successful */ + if (ret == -1) { + op_errno = errno; + gf_msg_debug(this->name, errno, + "copying location failed for path=%s", loc->path); + goto err; + } + } + + if (xattr_req) { + local->xattr_req = dict_ref(xattr_req); + } else { + local->xattr_req = dict_new(); + } + + /* Nameless lookup */ + + /* This is usually sent by NFS. Lookups are done based on the gfid and + * no name information is available. Without the name, dht cannot calculate + * the hash and has to send a lookup to all subvols. 
+ */ + if (gf_uuid_is_null(loc->pargfid) && !gf_uuid_is_null(loc->gfid) && + !__is_root_gfid(loc->inode->gfid)) { + local->cached_subvol = NULL; + dht_do_discover(frame, this, loc); + return 0; + } + + if (loc_is_root(loc)) { + /* Request the DHT commit hash xattr (trusted.glusterfs.dht.commithash) + * set on the brick root. + */ + ret = dict_set_uint32(local->xattr_req, conf->commithash_xattr_name, + sizeof(uint32_t)); + } + + if (!hashed_subvol) + hashed_subvol = dht_subvol_get_hashed(this, loc); + local->hashed_subvol = hashed_subvol; + + if (is_revalidate(loc)) { + /* The entry has been looked up before and has a dht inode_ctx + */ + dht_do_revalidate(frame, this, loc); + return 0; + } else { + /* Entry has not been looked up before + */ + dht_do_fresh_lookup(frame, this, loc); + return 0; + } + return 0; +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; +} - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); +static int +dht_unlink_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + local = frame->local; + prev = cookie; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; + LOCK(&frame->lock); + { + if ((op_ret == -1) && + !((op_errno == ENOENT) || (op_errno == ENOTCONN))) { + local->op_errno = op_errno; + UNLOCK(&frame->lock); + gf_msg_debug(this->name, op_errno, + "Unlink link: subvolume %s returned -1", prev->name); + goto post_unlock; } - local->inode = inode_ref (fd->inode); - local->call_cnt = 1; + local->op_ret = 0; + } + UNLOCK(&frame->lock); 
+post_unlock: + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); + DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, xdata); + + return 0; +} - STACK_WIND (frame, dht_truncate_cbk, - subvol, subvol->fops->ftruncate, - fd, offset); +static int +dht_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + xlator_t *hashed_subvol = NULL; - return 0; + local = frame->local; + prev = cookie; -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL); + LOCK(&frame->lock); + { + if (op_ret == -1) { + if (op_errno != ENOENT) { + local->op_ret = -1; + local->op_errno = op_errno; + } else { + local->op_ret = 0; + } + UNLOCK(&frame->lock); + gf_msg_debug(this->name, op_errno, + "Unlink: subvolume %s returned -1", prev->name); + goto post_unlock; + } - return 0; + local->op_ret = 0; + + local->postparent = *postparent; + local->preparent = *preparent; + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->preparent, 0); + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); + } + } + UNLOCK(&frame->lock); +post_unlock: + if (!local->op_ret) { + hashed_subvol = dht_subvol_get_hashed(this, &local->loc); + if (hashed_subvol && hashed_subvol != local->cached_subvol) { + /* + * If hashed and cached are different, then we need + * to unlink linkfile from hashed subvol if data + * file is deleted successfully + */ + STACK_WIND_COOKIE(frame, dht_unlink_linkfile_cbk, hashed_subvol, + hashed_subvol, hashed_subvol->fops->unlink, + &local->loc, local->flags, xdata); + return 0; + } + } + + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); + DHT_STACK_UNWIND(unlink, frame, 
local->op_ret, local->op_errno, + &local->preparent, &local->postparent, xdata); + + return 0; } +static int +dht_common_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata); + return 0; +} -int -dht_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent) +static int +dht_fix_layout_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + if (op_ret == 0) { + /* update the layout in the inode ctx */ local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_ret = -1; - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + layout = local->selfheal.layout; - preparent->ia_ino = local->loc.parent->ino; - postparent->ia_ino = local->loc.parent->ino; - local->op_ret = 0; + dht_layout_set(this, local->loc.inode, layout); + } + + DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata); + return 0; +} + +static int +dht_err_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; - local->postparent = *postparent; - local->preparent = *preparent; + local = frame->local; + prev = cookie; - WIPE (&local->postparent); - WIPE (&local->preparent); + LOCK(&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + UNLOCK(&frame->lock); + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto post_unlock; } -unlock: - UNLOCK (&frame->lock); - DHT_STACK_UNWIND (unlink, frame, 
local->op_ret, local->op_errno, - &local->preparent, &local->postparent); + local->op_ret = 0; + } + UNLOCK(&frame->lock); +post_unlock: + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + if ((local->fop == GF_FOP_SETXATTR) || + (local->fop == GF_FOP_FSETXATTR)) { + DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno, + NULL); + /* 'local' itself may not be valid after this */ + goto out; + } + if ((local->fop == GF_FOP_REMOVEXATTR) || + (local->fop == GF_FOP_FREMOVEXATTR)) { + DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno, + NULL); + } + } - return 0; +out: + return 0; } +/* Set the value[] of key into dict after convert from + host byte order to network byte order +*/ +int32_t +dht_dict_set_array(dict_t *dict, char *key, int32_t value[], int32_t size) +{ + int ret = -1; + int32_t *ptr = NULL; + int32_t vindex; + + if (value == NULL) { + return -EINVAL; + } + + ptr = GF_MALLOC(sizeof(int32_t) * size, gf_common_mt_char); + if (ptr == NULL) { + return -ENOMEM; + } + for (vindex = 0; vindex < size; vindex++) { + ptr[vindex] = hton32(value[vindex]); + } + ret = dict_set_bin(dict, key, ptr, sizeof(int32_t) * size); + if (ret) + GF_FREE(ptr); + return ret; +} -int -dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent) +static int +dht_common_mds_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; + call_frame_t *prev = cookie; - xlator_t *cached_subvol = NULL; + local = frame->local; - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + if 
(op_ret) + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->this->name); - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); + if (local->fop == GF_FOP_SETXATTR) { + DHT_STACK_UNWIND(setxattr, frame, 0, op_errno, local->xdata); + /* 'local' itself may not be valid after this */ + goto out; + } - if (op_ret == -1) - goto err; + if (local->fop == GF_FOP_FSETXATTR) { + DHT_STACK_UNWIND(fsetxattr, frame, 0, op_errno, local->xdata); + /* 'local' itself may not be valid after this */ + goto out; + } - cached_subvol = dht_subvol_get_cached (this, local->loc.inode); - if (!cached_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", - local->loc.path); - local->op_errno = EINVAL; - goto err; + if (local->fop == GF_FOP_REMOVEXATTR) { + DHT_STACK_UNWIND(removexattr, frame, 0, op_errno, NULL); + /* 'local' itself may not be valid after this */ + goto out; + } + + if (local->fop == GF_FOP_FREMOVEXATTR) { + DHT_STACK_UNWIND(fremovexattr, frame, 0, op_errno, NULL); + } + +out: + return 0; +} + +/* Code to wind a xattrop call to add 1 on current mds internal xattr + value +*/ +static int +dht_setxattr_non_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + int ret = 0; + dict_t *xattrop = NULL; + int32_t addone[1] = {1}; + call_frame_t *prev = NULL; + dht_conf_t *conf = NULL; + + local = frame->local; + prev = cookie; + conf = this->private; + + LOCK(&frame->lock); + { + if (op_ret && !local->op_ret) { + local->op_ret = op_ret; + local->op_errno = op_errno; + UNLOCK(&frame->lock); + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->this->name); + goto post_unlock; + } + } + UNLOCK(&frame->lock); +post_unlock: + this_call_cnt = dht_frame_return(frame); + + if (is_last_call(this_call_cnt)) { + if (!local->op_ret) { + xattrop = dict_new(); + if (!xattrop) { + gf_msg(this->name, GF_LOG_ERROR, 
DHT_MSG_NO_MEMORY, 0, + "dictionary creation failed"); + ret = -1; + goto out; + } + ret = dht_dict_set_array(xattrop, conf->mds_xattr_key, addone, 1); + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "dictionary set array failed "); + ret = -1; + goto out; + } + if ((local->fop == GF_FOP_SETXATTR) || + (local->fop == GF_FOP_REMOVEXATTR)) { + STACK_WIND(frame, dht_common_mds_xattrop_cbk, local->mds_subvol, + local->mds_subvol->fops->xattrop, &local->loc, + GF_XATTROP_ADD_ARRAY, xattrop, NULL); + } else { + STACK_WIND(frame, dht_common_mds_xattrop_cbk, local->mds_subvol, + local->mds_subvol->fops->fxattrop, local->fd, + GF_XATTROP_ADD_ARRAY, xattrop, NULL); + } + } else { + if (local->fop == GF_FOP_SETXATTR) { + DHT_STACK_UNWIND(setxattr, frame, 0, 0, local->xdata); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_FSETXATTR) { + DHT_STACK_UNWIND(fsetxattr, frame, 0, 0, local->xdata); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_REMOVEXATTR) { + DHT_STACK_UNWIND(removexattr, frame, 0, 0, NULL); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_FREMOVEXATTR) { + DHT_STACK_UNWIND(fremovexattr, frame, 0, 0, NULL); + /* 'local' itself may not be valid after this */ + goto just_return; + } + } + } +out: + if (ret) { + if (local->fop == GF_FOP_SETXATTR) { + DHT_STACK_UNWIND(setxattr, frame, 0, 0, local->xdata); + /* 'local' itself may not be valid after this */ + goto just_return; } - STACK_WIND (frame, dht_unlink_cbk, - cached_subvol, cached_subvol->fops->unlink, - &local->loc); + if (local->fop == GF_FOP_FSETXATTR) { + DHT_STACK_UNWIND(fsetxattr, frame, 0, 0, local->xdata); + /* 'local' itself may not be valid after this */ + goto just_return; + } - return 0; + if (local->fop == GF_FOP_REMOVEXATTR) { + DHT_STACK_UNWIND(removexattr, frame, 0, 0, NULL); + /* 'local' itself 
may not be valid after this */ + goto just_return; + } -err: - DHT_STACK_UNWIND (unlink, frame, -1, local->op_errno, - NULL, NULL); - return 0; + if (local->fop == GF_FOP_FREMOVEXATTR) { + DHT_STACK_UNWIND(fremovexattr, frame, 0, 0, NULL); + } + } +just_return: + if (xattrop) + dict_unref(xattrop); + return 0; } +static int +dht_setxattr_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + call_frame_t *prev = NULL; + xlator_t *mds_subvol = NULL; + int i = 0; + + local = frame->local; + prev = cookie; + conf = this->private; + mds_subvol = local->mds_subvol; + + if (op_ret == -1) { + local->op_ret = op_ret; + local->op_errno = op_errno; + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->this->name); + goto out; + } + + local->op_ret = 0; + local->call_cnt = conf->subvolume_cnt - 1; + local->xdata = dict_ref(xdata); + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (mds_subvol && (mds_subvol == conf->subvolumes[i])) + continue; + if (local->fop == GF_FOP_SETXATTR) { + STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i], + conf->subvolumes[i]->fops->setxattr, &local->loc, + local->xattr, local->flags, local->xattr_req); + } + + if (local->fop == GF_FOP_FSETXATTR) { + STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i], + conf->subvolumes[i]->fops->fsetxattr, local->fd, + local->xattr, local->flags, local->xattr_req); + } + + if (local->fop == GF_FOP_REMOVEXATTR) { + STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i], + conf->subvolumes[i]->fops->removexattr, &local->loc, + local->key, local->xattr_req); + } + + if (local->fop == GF_FOP_FREMOVEXATTR) { + STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i], + conf->subvolumes[i]->fops->fremovexattr, local->fd, + local->key, local->xattr_req); + } + } + + return 0; +out: + if (local->fop == GF_FOP_SETXATTR) { + DHT_STACK_UNWIND(setxattr, 
frame, local->op_ret, local->op_errno, + xdata); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_FSETXATTR) { + DHT_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno, + xdata); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_REMOVEXATTR) { + DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno, + NULL); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_FREMOVEXATTR) { + DHT_STACK_UNWIND(fremovexattr, frame, local->op_ret, local->op_errno, + NULL); + } + +just_return: + return 0; +} -int -dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, - int op_errno, struct iatt *prebuf, struct iatt *postbuf) +static int +dht_xattrop_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *dict, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + local->op_errno = op_errno; + local->op_ret = op_ret; + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->this->name); + goto out; + } + + if (local->fop == GF_FOP_SETXATTR) { + STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol, + local->mds_subvol->fops->setxattr, &local->loc, local->xattr, + local->flags, local->xattr_req); + } + + if (local->fop == GF_FOP_FSETXATTR) { + STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol, + local->mds_subvol->fops->fsetxattr, local->fd, local->xattr, + local->flags, local->xattr_req); + } + + if (local->fop == GF_FOP_REMOVEXATTR) { + STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol, + local->mds_subvol->fops->removexattr, &local->loc, + local->key, local->xattr_req); + } + + if (local->fop == GF_FOP_FREMOVEXATTR) { + 
STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol, + local->mds_subvol->fops->fremovexattr, local->fd, local->key, + local->xattr_req); + } + + return 0; +out: + if (local->fop == GF_FOP_SETXATTR) { + DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno, + xdata); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_FSETXATTR) { + DHT_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno, + xdata); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_REMOVEXATTR) { + DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno, + NULL); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_FREMOVEXATTR) { + DHT_STACK_UNWIND(fremovexattr, frame, local->op_ret, local->op_errno, + NULL); + } + +just_return: + return 0; +} +static void +fill_layout_info(dht_layout_t *layout, char *buf) +{ + int i = 0; + char tmp_buf[128] = { + 0, + }; + + for (i = 0; i < layout->cnt; i++) { + snprintf(tmp_buf, sizeof(tmp_buf), "(%s %u %u)", + layout->list[i].xlator->name, layout->list[i].start, + layout->list[i].stop); + if (i) + strcat(buf, " "); + strcat(buf, tmp_buf); + } +} - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } +static void +dht_fill_pathinfo_xattr(xlator_t *this, dht_local_t *local, char *xattr_buf, + int32_t alloc_len, int flag, char *layout_buf) +{ + if (flag) { + if (local->xattr_val) { + snprintf(xattr_buf, alloc_len, + "((<" DHT_PATHINFO_HEADER "%s> %s) (%s-layout %s))", + this->name, local->xattr_val, this->name, layout_buf); + } else { + snprintf(xattr_buf, alloc_len, "(%s-layout %s)", this->name, + layout_buf); + } + } else if (local->xattr_val) { + snprintf(xattr_buf, 
alloc_len, "(<" DHT_PATHINFO_HEADER "%s> %s)", + this->name, local->xattr_val); + } else { + xattr_buf[0] = '\0'; + } +} - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); +static int +dht_vgetxattr_alloc_and_fill(dht_local_t *local, dict_t *xattr, xlator_t *this, + int op_errno) +{ + int ret = -1; + char *value = NULL; + + ret = dict_get_str(xattr, local->xsel, &value); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_GET_XATTR_FAILED, + "Subvolume %s returned -1", this->name); + local->op_ret = -1; + local->op_errno = op_errno; + goto out; + } + + local->alloc_len += strlen(value); + + if (!local->xattr_val) { + local->alloc_len += (SLEN(DHT_PATHINFO_HEADER) + 10); + local->xattr_val = GF_MALLOC(local->alloc_len, gf_common_mt_char); + if (!local->xattr_val) { + ret = -1; + goto out; + } + local->xattr_val[0] = '\0'; + } + + int plen = strlen(local->xattr_val); + if (plen) { + /* extra byte(s) for \0 to be safe */ + local->alloc_len += (plen + 2); + local->xattr_val = GF_REALLOC(local->xattr_val, local->alloc_len); + if (!local->xattr_val) { + ret = -1; + goto out; + } + } + + (void)strcat(local->xattr_val, value); + (void)strcat(local->xattr_val, " "); + local->op_ret = 0; + + ret = 0; - if (local && (op_ret == 0)) { - prebuf->ia_ino = local->ia_ino; - postbuf->ia_ino = local->ia_ino; - } +out: + return ret; +} - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno, - prebuf, postbuf); +static int +dht_vgetxattr_fill_and_set(dht_local_t *local, dict_t **dict, xlator_t *this, + gf_boolean_t flag) +{ + int ret = -1; + char *xattr_buf = NULL; + char layout_buf[8192] = { + 0, + }; + + if (flag) + fill_layout_info(local->layout, layout_buf); + + *dict = dict_new(); + if (!*dict) + goto out; + + local->xattr_val[strlen(local->xattr_val) - 1] = '\0'; + + /* we would need max this many bytes to create xattr string + * extra 40 bytes is just an estimated 
amount of additional + * space required as we include translator name and some + * spaces, brackets etc. when forming the pathinfo string. + * + * For node-uuid we just don't have all the pretty formatting, + * but since this is a generic routine for pathinfo & node-uuid + * we don't have conditional space allocation and try to be + * generic + */ + local->alloc_len += (2 * strlen(this->name)) + strlen(layout_buf) + 40; + xattr_buf = GF_MALLOC(local->alloc_len, gf_common_mt_char); + if (!xattr_buf) + goto out; + + if (XATTR_IS_PATHINFO(local->xsel)) { + (void)dht_fill_pathinfo_xattr(this, local, xattr_buf, local->alloc_len, + flag, layout_buf); + } else if ((XATTR_IS_NODE_UUID(local->xsel)) || + (XATTR_IS_NODE_UUID_LIST(local->xsel))) { + (void)snprintf(xattr_buf, local->alloc_len, "%s", local->xattr_val); + } else { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GET_XATTR_FAILED, + "Unknown local->xsel (%s)", local->xsel); + GF_FREE(xattr_buf); + goto out; + } + + ret = dict_set_dynstr(*dict, local->xsel, xattr_buf); + if (ret) + GF_FREE(xattr_buf); + GF_FREE(local->xattr_val); - return 0; +out: + return ret; } +static int +dht_find_local_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *prev = NULL; + int this_call_cnt = 0; + int ret = 0; + char *uuid_str = NULL; + char *uuid_list = NULL; + char *next_uuid_str = NULL; + char *saveptr = NULL; + uuid_t node_uuid = { + 0, + }; + char *uuid_list_copy = NULL; + int count = 0; + int i = 0; + int index = 0; + int found = 0; + nodeuuid_info_t *tmp_ptr = NULL; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(frame->local, out); + + local = frame->local; + prev = cookie; + conf = this->private; + + VALIDATE_OR_GOTO(conf->defrag, out); + + gf_msg_debug(this->name, 0, "subvol %s returned", prev->name); + + LOCK(&frame->lock); + { + this_call_cnt = --local->call_cnt; + if (op_ret 
< 0) { + local->op_ret = -1; + local->op_errno = op_errno; + UNLOCK(&frame->lock); + if (op_errno == ENODATA) + gf_msg_debug(this->name, 0, "failed to get node-uuid"); + else + gf_msg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_GET_XATTR_FAILED, "failed to get node-uuid"); + goto post_unlock; + } + + ret = dict_get_str(xattr, local->xsel, &uuid_list); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_GET_FAILED, + "Failed to get %s", local->xsel); + local->op_ret = -1; + local->op_errno = EINVAL; + goto unlock; + } -int -dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; + /* As DHT will not know details of its child xlators + * we need to parse this twice to get the count first + * and allocate memory later. + */ + count = 0; + index = conf->local_subvols_cnt; + uuid_list_copy = gf_strdup(uuid_list); + if (!uuid_list_copy) + goto unlock; - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + for (uuid_str = strtok_r(uuid_list, " ", &saveptr); uuid_str; + uuid_str = next_uuid_str) { + next_uuid_str = strtok_r(NULL, " ", &saveptr); + if (gf_uuid_parse(uuid_str, node_uuid)) { + local->op_ret = -1; + local->op_errno = EINVAL; + UNLOCK(&frame->lock); + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UUID_PARSE_ERROR, + "Failed to parse uuid for %s", prev->name); + goto post_unlock; + } + + count++; + if (gf_uuid_compare(node_uuid, conf->defrag->node_uuid)) { + gf_msg_debug(this->name, 0, + "subvol %s does not" + "belong to this node", + prev->name); + } else { + /* handle multiple bricks of the same replica + * on the same node */ + if (found) + continue; + conf->local_subvols[(conf->local_subvols_cnt)++] = prev; + found = 1; + 
gf_msg_debug(this->name, 0, + "subvol %s belongs to" + " this node", + prev->name); + } + } + + if (!found) { + local->op_ret = 0; + goto unlock; + } + + conf->local_nodeuuids[index].count = count; + conf->local_nodeuuids[index].elements = GF_CALLOC( + count, sizeof(nodeuuid_info_t), 1); + + /* The node-uuids are guaranteed to be returned in the same + * order as the bricks + * A null node-uuid is returned for a brick that is down. + */ - local->op_ret = 0; + saveptr = NULL; + i = 0; + + for (uuid_str = strtok_r(uuid_list_copy, " ", &saveptr); uuid_str; + uuid_str = next_uuid_str) { + next_uuid_str = strtok_r(NULL, " ", &saveptr); + tmp_ptr = &(conf->local_nodeuuids[index].elements[i]); + gf_uuid_parse(uuid_str, tmp_ptr->uuid); + + if (!gf_uuid_compare(tmp_ptr->uuid, conf->defrag->node_uuid)) { + tmp_ptr->info = REBAL_NODEUUID_MINE; + } + i++; + tmp_ptr = NULL; } + } + + local->op_ret = 0; unlock: - UNLOCK (&frame->lock); + UNLOCK(&frame->lock); +post_unlock: + if (!is_last_call(this_call_cnt)) + goto out; - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (setxattr, frame, local->op_ret, local->op_errno); - } + if (local->op_ret == -1) { + goto unwind; + } - return 0; -} + DHT_STACK_UNWIND(getxattr, frame, 0, 0, xattr, xdata); + goto out; +unwind: -int -dht_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask) + GF_FREE(conf->local_nodeuuids[index].elements); + conf->local_nodeuuids[index].elements = NULL; + + DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, xdata); +out: + GF_FREE(uuid_list_copy); + return 0; +} + +static int +dht_vgetxattr_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + int ret = 0; + dht_local_t *local = NULL; + int this_call_cnt = 0; + dict_t *dict = NULL; + VALIDATE_OR_GOTO(frame, out); + 
VALIDATE_OR_GOTO(frame->local, out); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + local = frame->local; - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; + LOCK(&frame->lock); + { + this_call_cnt = --local->call_cnt; + if (op_ret < 0) { + if (op_errno != ENOTCONN) { + local->op_ret = -1; + local->op_errno = op_errno; + UNLOCK(&frame->lock); + gf_msg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_GET_XATTR_FAILED, "getxattr err for dir"); + goto post_unlock; + } + + goto unlock; } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; + ret = dht_vgetxattr_alloc_and_fill(local, xattr, this, op_errno); + if (ret) { + UNLOCK(&frame->lock); + gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_DICT_SET_FAILED, + "alloc or fill failure"); + goto post_unlock; } + } +unlock: + UNLOCK(&frame->lock); +post_unlock: + if (!is_last_call(this_call_cnt)) + goto out; - local->call_cnt = 1; + /* -- last call: do patch ups -- */ - STACK_WIND (frame, dht_err_cbk, - subvol, subvol->fops->access, - loc, mask); + if (local->op_ret == -1) { + goto unwind; + } - return 0; + ret = dht_vgetxattr_fill_and_set(local, &dict, this, _gf_true); + if (ret) + goto unwind; -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (access, frame, -1, op_errno); + DHT_STACK_UNWIND(getxattr, frame, 0, 0, dict, xdata); + goto cleanup; - return 0; +unwind: + DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, NULL); +cleanup: + if (dict) + dict_unref(dict); +out: + return 0; } - -int -dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, const char *path, struct iatt *sbuf) +static int +dht_vgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xattr, dict_t *xdata) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; + int ret = 0; + dict_t *dict = NULL; + xlator_t *prev = NULL; + gf_boolean_t flag = _gf_true; + + local = frame->local; + prev = cookie; + + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_GET_XATTR_FAILED, + "vgetxattr: Subvolume %s returned -1", prev->name); + goto unwind; + } + + ret = dht_vgetxattr_alloc_and_fill(local, xattr, this, op_errno); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY, + "Allocation or fill failure"); + goto unwind; + } + + flag = (local->layout->cnt > 1) ? 
_gf_true : _gf_false; + + ret = dht_vgetxattr_fill_and_set(local, &dict, this, flag); + if (ret) + goto unwind; + + DHT_STACK_UNWIND(getxattr, frame, 0, 0, dict, xdata); + goto cleanup; - local = frame->local; - if (op_ret == -1) - goto err; +unwind: + DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, NULL); +cleanup: + if (dict) + dict_unref(dict); - if (local) { - sbuf->ia_ino = local->ia_ino; - } else { - op_ret = -1; - op_errno = EINVAL; + return 0; +} + +static int +dht_linkinfo_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) +{ + int ret = 0; + char *value = NULL; + + if (op_ret != -1) { + ret = dict_get_str(xattr, GF_XATTR_PATHINFO_KEY, &value); + if (!ret) { + ret = dict_set_str(xattr, GF_XATTR_LINKINFO_KEY, value); + if (!ret) + gf_msg_trace(this->name, 0, "failed to set linkinfo"); } + } -err: - DHT_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, sbuf); + DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata); - return 0; + return 0; } +static int +dht_mds_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(frame->local, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + local = frame->local; + + if (!xattr || (op_ret == -1)) { + local->op_ret = op_ret; + goto out; + } + dict_del(xattr, conf->xattr_name); + local->op_ret = 0; + + if (!local->xattr) { + local->xattr = dict_copy_with_ref(xattr, NULL); + } + +out: + DHT_STACK_UNWIND(getxattr, frame, local->op_ret, op_errno, local->xattr, + xdata); + return 0; +err: + DHT_STACK_UNWIND(getxattr, frame, -1, EINVAL, NULL, NULL); + return 0; +} int -dht_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size) +dht_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + 
int op_errno, dict_t *xattr, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + int this_call_cnt = 0; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = 0; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(frame->local, err); + VALIDATE_OR_GOTO(this->private, err); - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; + conf = this->private; + local = frame->local; + + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto err; + return 0; + } + + LOCK(&frame->lock); + { + if (!xattr || (op_ret == -1)) { + local->op_ret = op_ret; + goto unlock; } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; + dict_del(xattr, conf->xattr_name); + dict_del(xattr, conf->mds_xattr_key); + + dict_del(xattr, conf->commithash_xattr_name); + + if (frame->root->pid >= 0) { + GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr); + GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr); } - local->ia_ino = loc->inode->ino; + local->op_ret = 0; - STACK_WIND (frame, dht_readlink_cbk, - subvol, subvol->fops->readlink, - loc, size); + if (!local->xattr) { + local->xattr = dict_copy_with_ref(xattr, NULL); + } else { + dht_aggregate_xattr(local->xattr, xattr); + } - return 0; + if (!local->xdata) { + local->xdata = dict_ref(xdata); + } else if ((local->inode && IA_ISDIR(local->inode->ia_type)) || + (local->fd && IA_ISDIR(local->fd->inode->ia_type))) { + dht_aggregate_xattr(local->xdata, xdata); + } + } +unlock: + UNLOCK(&frame->lock); -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL); + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + /* If we have a valid xattr received from any one of the + * subvolume, let's return it */ + if (local->xattr) { + local->op_ret = 0; + } - return 0; + DHT_STACK_UNWIND(getxattr, frame, local->op_ret, op_errno, local->xattr, + local->xdata); + } + return 0; +err: + DHT_STACK_UNWIND(getxattr, frame, -1, EINVAL, NULL, NULL); + return 0; } - -int -dht_fix_layout_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +static int32_t +dht_getxattr_unwind(call_frame_t *frame, int op_ret, int op_errno, dict_t *dict, + dict_t *xdata) { - DHT_STACK_UNWIND (getxattr, frame, -1, ENODATA, NULL); - - return 0; + DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; } -static void -fill_layout_info (dht_layout_t *layout, char *buf) +static int +dht_getxattr_get_real_filename_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) { - int i = 0; - char tmp_buf[128] = {0,}; + int this_call_cnt = 0; + dht_local_t *local = NULL; - for (i = 0; i < layout->cnt; i++) { - snprintf (tmp_buf, 128, "(%s %u %u)", - layout->list[i].xlator->name, - layout->list[i].start, - layout->list[i].stop); - if (i) - strcat (buf, " "); - strcat (buf, tmp_buf); - } -} + local = frame->local; -int -dht_pathinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) -{ - dht_local_t *local = NULL; - int ret = 0; - int flag = 0; - int this_call_cnt = 0; - char *value_got = NULL; - char layout_buf[8192] = {0,}; - char xattr_buf[8192 + 1024] = {0,}; - dict_t *dict = NULL; + LOCK(&frame->lock); + { + if (local->op_errno == EOPNOTSUPP) { + /* Nothing to do here, we have already found + * a subvol which does not have the get_real_filename + * optimization. 
If condition is for simple logic. + */ + goto unlock; + } - local = frame->local; + if (op_ret == -1) { + if (op_errno == EOPNOTSUPP) { + /* This subvol does not have the optimization. + * Better let the user know we don't support it. + * Remove previous results if any. + */ - if (op_ret != -1) { - ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value_got); - if (!ret) { - if (!local->pathinfo) - local->pathinfo = GF_CALLOC (8192, sizeof (char), - gf_common_mt_char); - if (local->pathinfo) - strcat (local->pathinfo, value_got); + if (local->xattr) { + dict_unref(local->xattr); + local->xattr = NULL; } - } - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - if (local->layout->cnt > 1) { - /* Set it for directory */ - fill_layout_info (local->layout, layout_buf); - flag = 1; + if (local->xattr_req) { + dict_unref(local->xattr_req); + local->xattr_req = NULL; } - dict = dict_new (); + local->op_ret = op_ret; + local->op_errno = op_errno; + UNLOCK(&frame->lock); + gf_msg(this->name, GF_LOG_WARNING, op_errno, + DHT_MSG_UPGRADE_BRICKS, + "At least " + "one of the bricks does not support " + "this operation. Please upgrade all " + "bricks."); + goto post_unlock; + } + + if (op_errno == ENOATTR) { + /* Do nothing, our defaults are set to this. + */ + goto unlock; + } + + /* This is a place holder for every other error + * case. I am not sure of how to interpret + * ENOTCONN etc. As of now, choosing to ignore + * down subvol and return a good result(if any) + * from other subvol. + */ + UNLOCK(&frame->lock); + gf_msg(this->name, GF_LOG_WARNING, op_errno, + DHT_MSG_GET_XATTR_FAILED, "Failed to get real filename."); + goto post_unlock; + } + + /* This subvol has the required file. + * There could be other subvols which have returned + * success already, choosing to return the latest good + * result. 
+ */ + if (local->xattr) + dict_unref(local->xattr); + local->xattr = dict_ref(xattr); + + if (local->xattr_req) { + dict_unref(local->xattr_req); + local->xattr_req = NULL; + } + if (xdata) + local->xattr_req = dict_ref(xdata); + + local->op_ret = op_ret; + local->op_errno = 0; + UNLOCK(&frame->lock); + gf_msg_debug(this->name, 0, "Found a matching file."); + goto post_unlock; + } +unlock: + UNLOCK(&frame->lock); +post_unlock: + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + DHT_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, + local->xattr, local->xattr_req); + } + + return 0; +} - if (flag && local->pathinfo) - snprintf (xattr_buf, 9216, "((%s %s) (%s-layout %s))", - this->name, local->pathinfo, this->name, - layout_buf); - else if (local->pathinfo) - snprintf (xattr_buf, 9216, "(%s %s)", - this->name, local->pathinfo); - else if (flag) - snprintf (xattr_buf, 9216, "(%s-layout %s)", - this->name, layout_buf); +static int +dht_getxattr_get_real_filename(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key, dict_t *xdata) +{ + dht_local_t *local = NULL; + int i = 0; + dht_layout_t *layout = NULL; + int cnt = 0; + xlator_t *subvol = NULL; - ret = dict_set_str (dict, GF_XATTR_PATHINFO_KEY, - xattr_buf); + local = frame->local; + layout = local->layout; - if (local->pathinfo) - GF_FREE (local->pathinfo); + cnt = local->call_cnt = layout->cnt; - DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + local->op_ret = -1; + local->op_errno = ENOATTR; - if (dict) - dict_unref (dict); + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND(frame, dht_getxattr_get_real_filename_cbk, subvol, + subvol->fops->getxattr, loc, key, xdata); + } - return 0; - } + return 0; +} - if (local->pathinfo) - strcat (local->pathinfo, " Link: "); - if (local->hashed_subvol) { - /* This will happen if there pending */ - STACK_WIND (frame, dht_pathinfo_getxattr_cbk, local->hashed_subvol, - 
local->hashed_subvol->fops->getxattr, - &local->loc, local->key); +static int +dht_marker_populate_args(call_frame_t *frame, int type, int *gauge, + xlator_t **subvols) +{ + dht_local_t *local = NULL; + int i = 0; + dht_layout_t *layout = NULL; - return 0; - } + local = frame->local; + layout = local->layout; - gf_log ("this->name", GF_LOG_ERROR, "Unable to find hashed_subvol for path" - " %s", local->pathinfo); + for (i = 0; i < layout->cnt; i++) + subvols[i] = layout->list[i].xlator; - DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, dict); - return 0; + return layout->cnt; } -int -dht_linkinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) -{ - int ret = 0; - char *value = NULL; - - if (op_ret != -1) { - ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value); - if (!ret) { - ret = dict_set_str (xattr, GF_XATTR_LINKINFO_KEY, value); - if (!ret) - gf_log (this->name, GF_LOG_TRACE, - "failed to set linkinfo"); - } - } +static int +dht_is_debug_xattr_key(const char **array, char *key) +{ + int i = 0; - DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr); + for (i = 0; array[i]; i++) { + if (fnmatch(array[i], key, FNM_NOESCAPE) == 0) + return i; + } - return 0; + return -1; } -int -dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) +/* Note we already have frame->local initialised here*/ + +static int +dht_handle_debug_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key) { - int this_call_cnt = 0; - dht_local_t *local = NULL; + dht_local_t *local = NULL; + int ret = -1; + int op_errno = ENODATA; + char *value = NULL; + loc_t file_loc = {0}; + const char *name = NULL; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (frame->local, out); + local = frame->local; - local = frame->local; + if (dht_is_debug_xattr_key(dht_dbg_vxattrs, (char *)key) == -1) { + goto out; + } - this_call_cnt = dht_frame_return (frame); + 
local->xattr = dict_new(); + if (!local->xattr) { + op_errno = ENOMEM; + goto out; + } - if (!xattr || (op_ret == -1)) - goto out; + if (strncmp(key, DHT_DBG_HASHED_SUBVOL_KEY, + SLEN(DHT_DBG_HASHED_SUBVOL_KEY)) == 0) { + name = key + strlen(DHT_DBG_HASHED_SUBVOL_KEY); + if (strlen(name) == 0) { + op_errno = EINVAL; + goto out; + } - if (dict_get (xattr, "trusted.glusterfs.dht")) { - dict_del (xattr, "trusted.glusterfs.dht"); + ret = dht_build_child_loc(this, &file_loc, loc, (char *)name); + if (ret) { + op_errno = ENOMEM; + goto out; } - local->op_ret = 0; - if (!local->xattr) { - local->xattr = dict_copy_with_ref (xattr, NULL); - } else { - /* first aggregate everything into xattr and then copy into - * local->xattr. - */ - dht_aggregate_xattr (xattr, local->xattr); - local->xattr = dict_copy (xattr, local->xattr); + local->hashed_subvol = dht_subvol_get_hashed(this, &file_loc); + if (local->hashed_subvol == NULL) { + op_errno = ENODATA; + goto out; } -out: - if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, local->xattr); + + value = gf_strdup(local->hashed_subvol->name); + if (!value) { + op_errno = ENOMEM; + goto out; } - return 0; + + ret = dict_set_dynstr(local->xattr, (char *)key, value); + if (ret < 0) { + op_errno = -ret; + ret = -1; + goto out; + } + ret = 0; + goto out; + } + +out: + loc_wipe(&file_loc); + DHT_STACK_UNWIND(getxattr, frame, ret, op_errno, local->xattr, NULL); + return 0; } -int32_t -dht_getxattr_unwind (call_frame_t *frame, - int op_ret, int op_errno, dict_t *dict) +/* Virtual Xattr which returns 1 if all subvols are up, + else returns 0. Geo-rep then uses this virtual xattr + after a fresh mount and starts the I/O. 
+*/ + +enum dht_vxattr_subvol { + DHT_VXATTR_SUBVOLS_UP = 1, + DHT_VXATTR_SUBVOLS_DOWN = 0, +}; + +int +dht_vgetxattr_subvol_status(call_frame_t *frame, xlator_t *this, + const char *key) { - DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); - return 0; -} + dht_local_t *local = NULL; + int ret = -1; + int op_errno = ENODATA; + int value = DHT_VXATTR_SUBVOLS_UP; + int i = 0; + dht_conf_t *conf = NULL; + + conf = this->private; + local = frame->local; + + if (!key) { + op_errno = EINVAL; + goto out; + } + local->xattr = dict_new(); + if (!local->xattr) { + op_errno = ENOMEM; + goto out; + } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->subvolume_status[i]) { + value = DHT_VXATTR_SUBVOLS_DOWN; + gf_msg_debug(this->name, 0, "subvol %s is down ", + conf->subvolumes[i]->name); + break; + } + } + ret = dict_set_int8(local->xattr, (char *)key, value); + if (ret < 0) { + op_errno = -ret; + ret = -1; + goto out; + } + ret = 0; +out: + DHT_STACK_UNWIND(getxattr, frame, ret, op_errno, local->xattr, NULL); + return 0; +} int -dht_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *key) -{ - xlator_t *subvol = NULL; - xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - xlator_t **sub_volumes = NULL; - int op_errno = -1; - int ret = 0; - int flag = 0; - int i = 0; - int cnt = 0; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (this->private, err); - - conf = this->private; - layout = dht_layout_get (this, loc->inode); - if (!layout) { - gf_log (this->name, GF_LOG_ERROR, - "layout is NULL"); - op_errno = ENOENT; - goto err; +dht_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key, + dict_t *xdata) +#define DHT_IS_DIR(layout) (layout->cnt > 1) +{ + xlator_t *subvol = NULL; 
+ xlator_t *hashed_subvol = NULL; + xlator_t *mds_subvol = NULL; + xlator_t *cached_subvol = NULL; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int op_errno = -1; + int i = 0; + int cnt = 0; + char *node_uuid_key = NULL; + int ret = -1; + + GF_CHECK_XATTR_KEY_AND_GOTO(key, IO_THREADS_QUEUE_SIZE_KEY, op_errno, err); + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + local = dht_local_init(frame, loc, NULL, GF_FOP_GETXATTR); + if (!local) { + op_errno = ENOMEM; + + goto err; + } + + layout = local->layout; + if (!layout) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_NULL, + "Layout is NULL"); + op_errno = ENOENT; + goto err; + } + + /* skip over code which is irrelevant without a valid key */ + if (!key) + goto no_key; + + local->key = gf_strdup(key); + if (!local->key) { + op_errno = ENOMEM; + goto err; + } + + if (strncmp(key, conf->mds_xattr_key, strlen(key)) == 0) { + op_errno = ENOTSUP; + goto err; + } + + if (strncmp(key, DHT_SUBVOL_STATUS_KEY, SLEN(DHT_SUBVOL_STATUS_KEY)) == 0) { + dht_vgetxattr_subvol_status(frame, this, key); + return 0; + } + + /* skip over code which is irrelevant if !DHT_IS_DIR(layout) */ + if (!DHT_IS_DIR(layout)) + goto no_dht_is_dir; + + if ((strncmp(key, GF_XATTR_GET_REAL_FILENAME_KEY, + SLEN(GF_XATTR_GET_REAL_FILENAME_KEY)) == 0) && + DHT_IS_DIR(layout)) { + dht_getxattr_get_real_filename(frame, this, loc, key, xdata); + return 0; + } + + if (!strcmp(key, GF_REBAL_FIND_LOCAL_SUBVOL)) { + ret = gf_asprintf(&node_uuid_key, "%s", GF_XATTR_LIST_NODE_UUIDS_KEY); + if (ret == -1 || !node_uuid_key) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY, + "Failed to copy node uuid key"); + op_errno = ENOMEM; + goto err; + } + (void)snprintf(local->xsel, sizeof(local->xsel), "%s", node_uuid_key); + cnt = local->call_cnt = 
conf->subvolume_cnt; + for (i = 0; i < cnt; i++) { + STACK_WIND_COOKIE(frame, dht_find_local_subvol_cbk, + conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->getxattr, loc, + node_uuid_key, xdata); + } + if (node_uuid_key) + GF_FREE(node_uuid_key); + return 0; + } + + if (!strcmp(key, GF_REBAL_OLD_FIND_LOCAL_SUBVOL)) { + ret = gf_asprintf(&node_uuid_key, "%s", GF_XATTR_NODE_UUID_KEY); + if (ret == -1 || !node_uuid_key) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY, + "Failed to copy node uuid key"); + op_errno = ENOMEM; + goto err; + } + (void)snprintf(local->xsel, sizeof(local->xsel), "%s", node_uuid_key); + cnt = local->call_cnt = conf->subvolume_cnt; + for (i = 0; i < cnt; i++) { + STACK_WIND_COOKIE(frame, dht_find_local_subvol_cbk, + conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->getxattr, loc, + node_uuid_key, xdata); + } + if (node_uuid_key) + GF_FREE(node_uuid_key); + return 0; + } + + /* for file use cached subvolume (obviously!): see if {} + * below + * for directory: + * wind to all subvolumes and exclude subvolumes which + * return ENOTCONN (in callback) + * + * NOTE: Don't trust inode here, as that may not be valid + * (until inode_link() happens) + */ + + if (XATTR_IS_PATHINFO(key) || (strcmp(key, GF_XATTR_NODE_UUID_KEY) == 0) || + (strcmp(key, GF_XATTR_LIST_NODE_UUIDS_KEY) == 0)) { + (void)snprintf(local->xsel, sizeof(local->xsel), "%s", key); + cnt = local->call_cnt = layout->cnt; + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND(frame, dht_vgetxattr_dir_cbk, subvol, + subvol->fops->getxattr, loc, key, xdata); } + return 0; + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; +no_dht_is_dir: + /* node-uuid or pathinfo for files */ + if (XATTR_IS_PATHINFO(key) || (strcmp(key, GF_XATTR_NODE_UUID_KEY) == 0)) { + cached_subvol = local->cached_subvol; + (void)snprintf(local->xsel, sizeof(local->xsel), "%s", key); + local->call_cnt = 1; + 
STACK_WIND_COOKIE(frame, dht_vgetxattr_cbk, cached_subvol, + cached_subvol, cached_subvol->fops->getxattr, loc, + key, xdata); - goto err; - } + return 0; + } - if (key && (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0)) { - hashed_subvol = dht_subvol_get_hashed (this, loc); - cached_subvol = dht_subvol_get_cached (this, loc->inode); + if (strcmp(key, GF_XATTR_LINKINFO_KEY) == 0) { + hashed_subvol = dht_subvol_get_hashed(this, loc); + if (!hashed_subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Failed to get hashed subvol for %s", loc->path); + op_errno = EINVAL; + goto err; + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; + cached_subvol = dht_subvol_get_cached(this, loc->inode); + if (!cached_subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_CACHED_SUBVOL_GET_FAILED, + "Failed to get cached subvol for %s", loc->path); + op_errno = EINVAL; + goto err; + } - goto err; - } + if (hashed_subvol == cached_subvol) { + op_errno = ENODATA; + goto err; + } - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = ENOMEM; + STACK_WIND(frame, dht_linkinfo_getxattr_cbk, hashed_subvol, + hashed_subvol->fops->getxattr, loc, GF_XATTR_PATHINFO_KEY, + xdata); + return 0; + } - goto err; - } - local->key = gf_strdup (key); - if (!local->key) { - op_errno = ENOMEM; + if (dht_is_debug_xattr_key(dht_dbg_vxattrs, (char *)key) >= 0) { + dht_handle_debug_getxattr(frame, this, loc, key); + return 0; + } - goto err; - } - local->layout = layout; +no_key: + if (cluster_handle_marker_getxattr(frame, loc, key, conf->vol_uuid, + dht_getxattr_unwind, + dht_marker_populate_args) == 0) + return 0; - local->call_cnt = 1; - if (hashed_subvol != cached_subvol) { - local->call_cnt = 2; - local->hashed_subvol = hashed_subvol; + if (DHT_IS_DIR(layout)) { + local->call_cnt = conf->subvolume_cnt; + cnt = conf->subvolume_cnt; + ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol); + if (!mds_subvol) { + 
gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Cannot determine MDS, fetching xattr %s randomly" + " from a subvol for path %s ", + key, loc->path); + } else { + /* TODO need to handle it, As of now we are + choosing availability instead of chossing + consistencty, in case of mds_subvol is + down winding a getxattr call on other subvol + and return xattr + */ + local->mds_subvol = mds_subvol; + for (i = 0; i < cnt; i++) { + if (conf->subvolumes[i] == mds_subvol) { + if (!conf->subvolume_status[i]) { + gf_msg(this->name, GF_LOG_INFO, 0, + DHT_MSG_HASHED_SUBVOL_DOWN, + "MDS %s is down for path" + " path %s so fetching xattr " + "%s randomly from a subvol ", + local->mds_subvol->name, loc->path, key); + ret = 1; + } } + } + } - STACK_WIND (frame, dht_pathinfo_getxattr_cbk, cached_subvol, - cached_subvol->fops->getxattr, loc, key); + if (!ret && key && local->mds_subvol && dht_match_xattr(key)) { + STACK_WIND(frame, dht_mds_getxattr_cbk, local->mds_subvol, + local->mds_subvol->fops->getxattr, loc, key, xdata); - return 0; - } - if (key && (strcmp (key, GF_XATTR_LINKINFO_KEY) == 0)) { - hashed_subvol = dht_subvol_get_hashed (this, loc); - cached_subvol = dht_subvol_get_cached (this, loc->inode); - if (hashed_subvol == cached_subvol) { - op_errno = ENODATA; - goto err; - } - if (hashed_subvol) { - STACK_WIND (frame, dht_linkinfo_getxattr_cbk, hashed_subvol, - hashed_subvol->fops->getxattr, loc, - GF_XATTR_PATHINFO_KEY); - return 0; - } - op_errno = ENODATA; - goto err; + return 0; } - if (key && (strcmp (key, GF_XATTR_FIX_LAYOUT_KEY) == 0)) { - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].start == layout->list[i].stop) { - flag = 1; - break; - } - } - if ((layout->cnt < conf->subvolume_cnt) || flag) { - gf_log (this->name, GF_LOG_INFO, - "expanding layout of %s from %d to %d", - loc->path, layout->cnt, conf->subvolume_cnt); - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - - goto err; - } + } else { + cnt = 
local->call_cnt = 1; + } - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = ENOMEM; + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->getxattr, loc, + key, xdata); + } + return 0; - goto err; - } - local->layout = layout; - //layout = dht_layout_new (this, conf->subvolume_cnt); +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL); - dht_selfheal_new_directory (frame, dht_fix_layout_cbk, - layout); - return 0; + return 0; +} +#undef DHT_IS_DIR + +int +dht_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int op_errno = -1; + int i = 0; + int cnt = 0; + xlator_t *mds_subvol = NULL; + int ret = -1; + dht_conf_t *conf = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + VALIDATE_OR_GOTO(fd->inode, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + local = dht_local_init(frame, NULL, fd, GF_FOP_FGETXATTR); + if (!local) { + op_errno = ENOMEM; + + goto err; + } + + layout = local->layout; + if (!layout) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_NULL, + "Layout is NULL"); + op_errno = ENOENT; + goto err; + } + + if (key) { + local->key = gf_strdup(key); + if (!local->key) { + op_errno = ENOMEM; + goto err; + } + } + + gf_uuid_unparse(fd->inode->gfid, gfid); + + if ((fd->inode->ia_type == IA_IFDIR) && key && + (strncmp(key, GF_XATTR_LOCKINFO_KEY, SLEN(GF_XATTR_LOCKINFO_KEY)) != + 0)) { + local->call_cnt = conf->subvolume_cnt; + cnt = conf->subvolume_cnt; + ret = dht_inode_ctx_mdsvol_get(fd->inode, this, &mds_subvol); + + if (!mds_subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "cannot determine MDS, fetching xattr %s " + " 
randomly from a subvol for gfid %s ", + key, gfid); + } else { + /* TODO need to handle it, As of now we are + choosing availability instead of chossing + consistencty, in case of hashed_subvol is + down winding a getxattr call on other subvol + and return xattr + */ + local->mds_subvol = mds_subvol; + for (i = 0; i < cnt; i++) { + if (conf->subvolumes[i] == mds_subvol) { + if (!conf->subvolume_status[i]) { + gf_msg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_HASHED_SUBVOL_DOWN, + "MDS subvolume %s is down" + " for gfid %s so fetching xattr " + " %s randomly from a subvol ", + local->mds_subvol->name, gfid, key); + ret = 1; + } } - op_errno = ENODATA; - goto err; + } } - if (key && (!strcmp (GF_XATTR_MARKER_KEY, key)) - && (-1 == frame->root->pid)) { - - if (loc->inode-> ia_type == IA_IFDIR) { - cnt = layout->cnt; - } else { - cnt = 1; - } - sub_volumes = alloca ( cnt * sizeof (xlator_t *)); - for (i = 0; i < cnt; i++) - *(sub_volumes + i) = layout->list[i].xlator; - - if (cluster_getmarkerattr (frame, this, loc, key, - local, dht_getxattr_unwind, - sub_volumes, cnt, - MARKER_UUID_TYPE, conf->vol_uuid)) { - op_errno = EINVAL; - goto err; - } + if (!ret && key && local->mds_subvol && dht_match_xattr(key)) { + STACK_WIND(frame, dht_mds_getxattr_cbk, local->mds_subvol, + local->mds_subvol->fops->fgetxattr, fd, key, NULL); - return 0; + return 0; } - if (key && *conf->vol_uuid) { - if ((match_uuid_local (key, conf->vol_uuid) == 0) && - (-1 == frame->root->pid)) { - if (loc->inode-> ia_type == IA_IFDIR) { - cnt = layout->cnt; - } else { - cnt = 1; - } - sub_volumes = alloca ( cnt * sizeof (xlator_t *)); - for (i = 0; i < cnt; i++) - sub_volumes[i] = layout->list[i].xlator; - - if (cluster_getmarkerattr (frame, this, loc, key, - local, dht_getxattr_unwind, - sub_volumes, cnt, - MARKER_XTIME_TYPE, - conf->vol_uuid)) { - op_errno = EINVAL; - goto err; - } + } else { + cnt = local->call_cnt = 1; + } - return 0; - } - } + for (i = 0; i < cnt; i++) { + subvol = 
layout->list[i].xlator; + STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->fgetxattr, fd, + key, NULL); + } + return 0; - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = ENOMEM; +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL); - goto err; - } + return 0; +} - if (key) { - local->key = gf_strdup (key); - if (!local->key) { - op_errno = ENOMEM; +static int +dht_setxattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) +{ + dht_local_t *local = NULL; + int op_errno = EINVAL; - goto err; - } - } - local->layout = layout; + if (!frame || !frame->local) + goto err; - if (loc->inode-> ia_type == IA_IFDIR) { - cnt = local->call_cnt = layout->cnt; - } else { - cnt = local->call_cnt = 1; - } + local = frame->local; + op_errno = local->op_errno; - for (i = 0; i < cnt; i++) { - subvol = layout->list[i].xlator; - STACK_WIND (frame, dht_getxattr_cbk, - subvol, subvol->fops->getxattr, - loc, key); - } + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno, + local->rebalance.xdata); return 0; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL); + if (subvol == NULL) + goto err; - return 0; + local->call_cnt = 2; /* This is the second attempt */ + + if (local->fop == GF_FOP_SETXATTR) { + STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol, + subvol->fops->setxattr, &local->loc, + local->rebalance.xattr, local->rebalance.flags, + local->xattr_req); + } else { + STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol, + subvol->fops->fsetxattr, local->fd, + local->rebalance.xattr, local->rebalance.flags, + local->xattr_req); + } + + return 0; + +err: + DHT_STACK_UNWIND(setxattr, frame, (local ? 
local->op_ret : -1), op_errno, + NULL); + return 0; } int -dht_fsetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *xattr, int flags) +dht_file_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - xlator_t *subvol = NULL; - dht_local_t *local = NULL; - int op_errno = EINVAL; + int ret = -1; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + struct iatt *stbuf = NULL; + inode_t *inode = NULL; + xlator_t *subvol1 = NULL, *subvol2 = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); + local = frame->local; + prev = cookie; - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + local->op_errno = op_errno; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; - } + if ((local->fop == GF_FOP_FSETXATTR) && + dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } - local->inode = inode_ref (fd->inode); - local->call_cnt = 1; + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1.", + prev->name); + goto out; + } - STACK_WIND (frame, dht_err_cbk, subvol, subvol->fops->fsetxattr, - fd, xattr, flags); + if (local->call_cnt != 1) + goto out; - return 0; + ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf); -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (fsetxattr, frame, -1, op_errno); + if ((!op_ret) && !stbuf) { + goto out; + } - return 0; -} + local->op_ret = op_ret; + local->rebalance.target_op_fn = dht_setxattr2; + if (xdata) + local->rebalance.xdata = dict_ref(xdata); + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } -int -dht_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr, int flags) -{ - xlator_t *subvol = NULL; - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - dht_layout_t *layout = NULL; - int i = 0; - int op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - conf = this->private; - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + /* Phase 1 of migration */ + if (IS_DHT_MIGRATION_PHASE1(stbuf)) { + inode = (local->fd) ? 
local->fd->inode : local->loc.inode; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; + ret = dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2); + if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) { + dht_setxattr2(this, subvol2, frame, 0); + return 0; } - local->layout = layout = dht_layout_get (this, loc->inode); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for path=%s", loc->path); - op_errno = EINVAL; + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } + +out: + + if (local->fop == GF_FOP_SETXATTR) { + DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata); + } else { + DHT_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata); + } + + return 0; +} + +/* Function is call by dict_foreach_fnmatch if key is match with + user.* and set boolean flag to true +*/ +static int +dht_is_user_xattr(dict_t *this, char *key, data_t *value, void *data) +{ + gf_boolean_t *user_xattr_found = data; + *user_xattr_found = _gf_true; + return 0; +} + +/* Common code to wind a (f)(set|remove)xattr call to set xattr on directory + */ +static int +dht_dir_common_set_remove_xattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + fd_t *fd, dict_t *xattr, int flags, + dict_t *xdata, int *op_errno) + +{ + dict_t *xattrop = NULL; + int32_t subone[1] = {-1}; + gf_boolean_t uxattr_key_found = _gf_false; + xlator_t *mds_subvol = NULL; + xlator_t *travvol = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int i = 0; + int call_cnt = 0; + dht_local_t *local = NULL; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + char **xattrs_to_heal; + + conf = this->private; + local = frame->local; + call_cnt = conf->subvolume_cnt; + local->flags = flags; + xattrs_to_heal = get_xattrs_to_heal(); + + if (!gf_uuid_is_null(local->gfid)) { + gf_uuid_unparse(local->gfid, gfid_local); + } + + if ((local->fop == GF_FOP_SETXATTR) || (local->fop == GF_FOP_FSETXATTR)) { + /* Check if any user xattr 
present in xattr + */ + dict_foreach_fnmatch(xattr, "user*", dht_is_user_xattr, + &uxattr_key_found); + + /* Check if any custom key xattr present in dict xattr + and start index from 1 because user xattr already + checked in previous line + */ + for (i = 1; xattrs_to_heal[i]; i++) + if (dict_get(xattr, xattrs_to_heal[i])) + uxattr_key_found = _gf_true; + } + + if ((local->fop == GF_FOP_REMOVEXATTR) || + (local->fop == GF_FOP_FREMOVEXATTR)) { + /* Check if any custom key xattr present in local->key + */ + for (i = 0; xattrs_to_heal[i]; i++) + if (strstr(local->key, xattrs_to_heal[i])) + uxattr_key_found = _gf_true; + } + + /* If there is no custom key xattr present or gfid is root + or call_cnt is 1 then wind a (f)setxattr call on all subvols + */ + if (!uxattr_key_found || __is_root_gfid(local->gfid) || call_cnt == 1) { + for (i = 0; i < conf->subvolume_cnt; i++) { + travvol = conf->subvolumes[i]; + if ((local->fop == GF_FOP_SETXATTR) || + (local->fop == GF_FOP_FSETXATTR)) { + if (fd) { + STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol, + travvol->fops->fsetxattr, fd, xattr, + flags, xdata); + } else { + STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol, + travvol->fops->setxattr, loc, xattr, + flags, xdata); + } + } + + if ((local->fop == GF_FOP_REMOVEXATTR) || + (local->fop == GF_FOP_FREMOVEXATTR)) { + if (fd) { + STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol, + travvol->fops->fremovexattr, fd, + local->key, local->xattr_req); + } else { + STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol, + travvol->fops->removexattr, loc, + local->key, local->xattr_req); + } + } + } + + return 0; + } + + /* Calculate hash subvol based on inode and parent inode + */ + if (fd) { + ret = dht_inode_ctx_mdsvol_get(fd->inode, this, &mds_subvol); + } else { + ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol); + } + if (ret || !mds_subvol) { + if (fd) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Failed to get mds 
subvol for fd %p" + "gfid is %s ", + fd, gfid_local); + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "%s: Failed to get mds subvol. (gfid is %s)", loc->path, + gfid_local); + } + (*op_errno) = ENOENT; + goto err; + } + + local->mds_subvol = mds_subvol; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == mds_subvol) { + if (!conf->subvolume_status[i]) { + gf_msg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_HASHED_SUBVOL_DOWN, + "MDS subvol is down for path " + " %s gfid is %s Unable to set xattr ", + local->loc.path, gfid_local); + (*op_errno) = ENOTCONN; goto err; + } + } + } + + if (uxattr_key_found) { + xattrop = dict_new(); + if (!xattrop) { + gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, 0, + "dictionary creation failed for path %s " + "for gfid is %s ", + local->loc.path, gfid_local); + (*op_errno) = ENOMEM; + goto err; + } + local->xattr = dict_ref(xattr); + /* Subtract current MDS xattr value to -1 , value of MDS + xattr represents no. of times xattr modification failed + on non MDS subvols. 
+ */ + ret = dht_dict_set_array(xattrop, conf->mds_xattr_key, subone, 1); + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "dictionary set array failed for path %s " + "for gfid is %s ", + local->loc.path, gfid_local); + if (xattrop) + dict_unref(xattrop); + (*op_errno) = ret; + goto err; + } + /* Wind a xattrop call to use ref counting approach + update mds xattr to -1 before update xattr on + hashed subvol and update mds xattr to +1 after update + xattr on all non hashed subvol + */ + if (fd) { + STACK_WIND(frame, dht_xattrop_mds_cbk, local->mds_subvol, + local->mds_subvol->fops->fxattrop, fd, + GF_XATTROP_ADD_ARRAY, xattrop, NULL); + } else { + STACK_WIND(frame, dht_xattrop_mds_cbk, local->mds_subvol, + local->mds_subvol->fops->xattrop, loc, + GF_XATTROP_ADD_ARRAY, xattrop, NULL); } + if (xattrop) + dict_unref(xattrop); + } - local->call_cnt = layout->cnt; + return 0; +err: + return -1; +} - for (i = 0; i < layout->cnt; i++) { - STACK_WIND (frame, dht_err_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->setxattr, - loc, xattr, flags); +int +dht_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr, + int flags, dict_t *xdata) +{ + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + int op_errno = EINVAL; + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + int call_cnt = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + VALIDATE_OR_GOTO(fd->inode, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + if (!conf->defrag) + GF_IF_INTERNAL_XATTR_GOTO(conf->wild_xattr_name, xattr, op_errno, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_FSETXATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + layout = local->layout; + if (!layout) 
{ + gf_msg_debug(this->name, 0, "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local->xattr_req = xdata ? dict_ref(xdata) : dict_new(); + local->call_cnt = call_cnt = layout->cnt; + + if (IA_ISDIR(fd->inode->ia_type)) { + local->hashed_subvol = NULL; + ret = dht_dir_common_set_remove_xattr(frame, this, NULL, fd, xattr, + flags, xdata, &op_errno); + if (ret) + goto err; + } else { + local->call_cnt = 1; + local->rebalance.xattr = dict_ref(xattr); + local->rebalance.flags = flags; + + ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1); + if (ret) { + gf_msg_debug(this->name, 0, + "Failed to set dictionary key %s for fd=%p", + DHT_IATT_IN_XDATA_KEY, fd); } - return 0; + STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol, + subvol->fops->fsetxattr, fd, xattr, flags, + local->xattr_req); + } + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (setxattr, frame, -1, op_errno); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL); - return 0; + return 0; } +static int +dht_checking_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) +{ + int i = -1; + int ret = -1; + char *value = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *prev = NULL; + int this_call_cnt = 0; + + local = frame->local; + prev = cookie; + conf = this->private; + + if (op_ret == -1) + goto out; + + ret = dict_get_str(xattr, GF_XATTR_PATHINFO_KEY, &value); + if (ret) + goto out; + + if (!strcmp(value, local->key)) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) + conf->decommissioned_bricks[i] = prev; + } + } + +out: + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + DHT_STACK_UNWIND(setxattr, frame, local->op_ret, ENOTSUP, NULL); + } + return 0; +} -int -dht_removexattr_cbk (call_frame_t *frame, void *cookie, 
xlator_t *this, - int op_ret, int op_errno) +static int +dht_nuke_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; + STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, NULL); + return 0; +} - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } +static int +dht_nuke_dir(call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *tmp) +{ + if (!IA_ISDIR(loc->inode->ia_type)) { + DHT_STACK_UNWIND(setxattr, frame, -1, ENOTSUP, NULL); + return 0; + } - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); + /* Setxattr didn't need the parent, but rmdir does. */ + loc->parent = inode_parent(loc->inode, NULL, NULL); + if (!loc->parent) { + DHT_STACK_UNWIND(setxattr, frame, -1, ENOENT, NULL); + return 0; + } + gf_uuid_copy(loc->pargfid, loc->parent->gfid); - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (removexattr, frame, local->op_ret, local->op_errno); + if (!loc->name && loc->path) { + loc->name = strrchr(loc->path, '/'); + if (loc->name) { + ++(loc->name); } + } - return 0; -} + /* + * We do this instead of calling dht_rmdir_do directly for two reasons. + * The first is that we want to reuse all of the initialization that + * dht_rmdir does, so if it ever changes we'll just follow along. The + * second (i.e. why we don't use STACK_WIND_TAIL) is so that we don't + * obscure the fact that we came in via this path instead of a genuine + * rmdir. That makes debugging just a tiny bit easier. 
+ */ + STACK_WIND(frame, dht_nuke_dir_cbk, this, this->fops->rmdir, loc, 1, NULL); + return 0; +} int -dht_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *key) +dht_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr, + int flags, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; + dht_layout_t *layout = NULL; + int i = 0; + int op_errno = EINVAL; + int ret = -1; + data_t *tmp = NULL; + uint32_t dir_spread = 0; + char value[4096] = { + 0, + }; + gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA; + int call_cnt = 0; + uint32_t new_hash = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, err); + + methods = &(conf->methods); + + /* Rebalance daemon is allowed to set internal keys */ + if (!conf->defrag) + GF_IF_INTERNAL_XATTR_GOTO(conf->wild_xattr_name, xattr, op_errno, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_SETXATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local->call_cnt = call_cnt = layout->cnt; + tmp = dict_get(xattr, conf->mds_xattr_key); + if (tmp) { + op_errno = ENOTSUP; + goto err; + } + + tmp = dict_get(xattr, GF_XATTR_FILE_MIGRATE_KEY); + if (tmp) { + if (IA_ISDIR(loc->inode->ia_type)) { + op_errno = ENOTSUP; + goto err; + } + + /* TODO: need to interpret the 'value' for more meaning + (ie, 'target' subvolume 
given there, etc) */ + memcpy(value, tmp->data, tmp->len); + if (strcmp(value, "force") == 0) + forced_rebalance = GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS; + + if (conf->decommission_in_progress) + forced_rebalance = GF_DHT_MIGRATE_HARDLINK; + + if (!loc->path) { + op_errno = EINVAL; + goto err; + } + + if (!local->loc.name) + local->loc.name = strrchr(local->loc.path, '/') + 1; + + if (!local->loc.parent) + local->loc.parent = inode_parent(local->loc.inode, NULL, NULL); + + if ((!local->loc.name) || (!local->loc.parent)) { + op_errno = EINVAL; + goto err; + } + + if (gf_uuid_is_null(local->loc.pargfid)) + gf_uuid_copy(local->loc.pargfid, local->loc.parent->gfid); + + methods->migration_get_dst_subvol(this, local); + + if (!local->rebalance.target_node) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Failed to get hashed subvol for %s", loc->path); + op_errno = EINVAL; + goto err; + } + + local->rebalance.from_subvol = local->cached_subvol; + + if (local->rebalance.target_node == local->rebalance.from_subvol) { + op_errno = EEXIST; + goto err; + } + if (local->rebalance.target_node) { + local->flags = forced_rebalance; - int i; + frame->root->pid = GF_CLIENT_PID_DEFRAG; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + ret = dht_start_rebalance_task(this, frame); + if (!ret) + return 0; - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_START_FAILED, + "%s: failed to create a new rebalance synctask", loc->path); } + op_errno = EINVAL; + goto err; + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; + tmp = dict_get(xattr, "decommission-brick"); + if (tmp) { + /* This operation should happen 
only on '/' */ + if (!__is_root_gfid(loc->inode->gfid)) { + op_errno = ENOTSUP; + goto err; } - local->layout = layout = dht_layout_get (this, loc->inode); - if (!local->layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for path=%s", loc->path); - op_errno = EINVAL; + memcpy(value, tmp->data, min(tmp->len, 4095)); + local->key = gf_strdup(value); + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + /* Get the pathinfo, and then compare */ + STACK_WIND_COOKIE(frame, dht_checking_pathinfo_cbk, + conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->getxattr, loc, + GF_XATTR_PATHINFO_KEY, NULL); + } + return 0; + } + + tmp = dict_get(xattr, GF_XATTR_FIX_LAYOUT_KEY); + if (tmp) { + ret = dict_get_uint32(xattr, "new-commit-hash", &new_hash); + if (ret == 0) { + gf_msg_debug(this->name, 0, + "updating commit hash for %s from %u to %u", + uuid_utoa(loc->gfid), layout->commit_hash, new_hash); + layout->commit_hash = new_hash; + + ret = dht_update_commit_hash_for_layout(frame); + if (ret) { + op_errno = ENOTCONN; goto err; + } + return ret; } - local->call_cnt = layout->cnt; + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_FIX_LAYOUT_INFO, + "fixing the layout of %s", loc->path); - for (i = 0; i < layout->cnt; i++) { - STACK_WIND (frame, dht_removexattr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->removexattr, - loc, key); + ret = dht_fix_directory_layout(frame, dht_fix_layout_setxattr_cbk, + layout); + if (ret) { + op_errno = ENOTCONN; + goto err; } + return ret; + } + + tmp = dict_get(xattr, "distribute.directory-spread-count"); + if (tmp) { + /* Setxattr value is packed as 'binary', not string */ + memcpy(value, tmp->data, min(tmp->len, 4095)); + ret = gf_string2uint32(value, &dir_spread); + if (!ret && ((dir_spread <= conf->subvolume_cnt) && (dir_spread > 0))) { + layout->spread_cnt = dir_spread; + + ret = dht_fix_directory_layout(frame, dht_common_setxattr_cbk, + layout); + if (ret) { + 
op_errno = ENOTCONN; + goto err; + } + return ret; + } + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_OPERATION_NOT_SUP, + "wrong 'directory-spread-count' value (%s)", value); + op_errno = ENOTSUP; + goto err; + } + + tmp = dict_get(xattr, "glusterfs.dht.nuke"); + if (tmp) { + return dht_nuke_dir(frame, this, loc, tmp); + } + local->xattr_req = xdata ? dict_ref(xdata) : dict_new(); + + if (IA_ISDIR(loc->inode->ia_type)) { + local->hashed_subvol = NULL; + ret = dht_dir_common_set_remove_xattr(frame, this, loc, NULL, xattr, + flags, xdata, &op_errno); + if (ret) + goto err; + } else { + local->rebalance.xattr = dict_ref(xattr); + local->rebalance.flags = flags; + local->call_cnt = 1; - return 0; + ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1); + + STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol, + subvol->fops->setxattr, loc, xattr, flags, + local->xattr_req); + } + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (removexattr, frame, -1, op_errno); + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL); - return 0; + return 0; } - -int -dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) +static int +dht_removexattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - + dht_local_t *local = NULL; + int op_errno = EINVAL; - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + if (!frame || !frame->local) + goto err; - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); + local = frame->local; + op_errno = local->op_errno; - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, - local->fd); + local->call_cnt = 2; /* This is the second attempt */ + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. 
+ */ + DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno, + local->rebalance.xdata); return 0; -} + } + + if (subvol == NULL) + goto err; + + if (local->fop == GF_FOP_REMOVEXATTR) { + STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol, + subvol->fops->removexattr, &local->loc, local->key, + local->xattr_req); + } else { + STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol, + subvol->fops->fremovexattr, local->fd, local->key, + local->xattr_req); + } + + return 0; +err: + DHT_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL); + return 0; +} int -dht_open (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags, fd_t *fd, int wbflags) +dht_file_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - xlator_t *subvol = NULL; - int ret = -1; - int op_errno = -1; - dht_local_t *local = NULL; + int ret = -1; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + struct iatt *stbuf = NULL; + inode_t *inode = NULL; + xlator_t *subvol1 = NULL, *subvol2 = NULL; + local = frame->local; + prev = cookie; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + local->op_errno = op_errno; - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + if ((local->fop == GF_FOP_FREMOVEXATTR) && + dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto out; + } - goto err; - } + if (local->call_cnt != 1) + goto out; - local->fd = fd_ref (fd); - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - 
op_errno = ENOMEM; + ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf); - goto err; - } + if ((!op_ret) && !stbuf) { + goto out; + } - local->call_cnt = 1; + local->op_ret = 0; - STACK_WIND (frame, dht_fd_cbk, - subvol, subvol->fops->open, - loc, flags, fd, wbflags); + local->rebalance.target_op_fn = dht_removexattr2; + if (xdata) + local->rebalance.xdata = dict_ref(xdata); - return 0; + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (open, frame, -1, op_errno, NULL); + /* Phase 1 of migration */ + if (IS_DHT_MIGRATION_PHASE1(stbuf)) { + inode = (local->fd) ? local->fd->inode : local->loc.inode; - return 0; -} + ret = dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2); + if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) { + dht_removexattr2(this, subvol2, frame, 0); + return 0; + } + + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } +out: + if (local->fop == GF_FOP_REMOVEXATTR) { + DHT_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata); + } else { + DHT_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata); + } + return 0; +} int -dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - struct iovec *vector, int count, struct iatt *stbuf, - struct iobref *iobref) +dht_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key, dict_t *xdata) { - dht_local_t *local = frame->local; + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int call_cnt = 0; + dht_conf_t *conf = NULL; + int ret = 0; + + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + GF_IF_NATIVE_XATTR_GOTO(conf->wild_xattr_name, key, op_errno, err); + + 
VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_REMOVEXATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + layout = local->layout; + if (!local->layout) { + gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); + + local->call_cnt = call_cnt = layout->cnt; + local->key = gf_strdup(key); + + if (key && (strncmp(key, conf->mds_xattr_key, strlen(key)) == 0)) { + op_errno = ENOTSUP; + goto err; + } + + if (IA_ISDIR(loc->inode->ia_type)) { + local->hashed_subvol = NULL; + ret = dht_dir_common_set_remove_xattr(frame, this, loc, NULL, NULL, 0, + local->xattr_req, &op_errno); + if (ret) + goto err; - if (!local) { - op_ret = -1; - op_errno = EINVAL; - goto out; + } else { + local->call_cnt = 1; + ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to " + "set dictionary key %s for %s", + DHT_IATT_IN_XDATA_KEY, loc->path); } - if (op_ret != -1) - stbuf->ia_ino = local->ia_ino; -out: - DHT_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count, stbuf, - iobref); + STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol, + subvol->fops->removexattr, loc, key, + local->xattr_req); + } - return 0; -} + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL); + return 0; +} int -dht_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off) +dht_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, + dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int call_cnt = 0; + dht_conf_t *conf = 0; + int ret = 0; + + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + GF_IF_NATIVE_XATTR_GOTO(conf->wild_xattr_name, key, op_errno, err); + + VALIDATE_OR_GOTO(frame, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_FREMOVEXATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for inode=%s", + uuid_utoa(fd->inode->gfid)); + op_errno = EINVAL; + goto err; + } + + layout = local->layout; + if (!local->layout) { + gf_msg_debug(this->name, 0, "no layout for inode=%s", + uuid_utoa(fd->inode->gfid)); + op_errno = EINVAL; + goto err; + } + local->xattr_req = xdata ? 
dict_ref(xdata) : dict_new(); + + local->call_cnt = call_cnt = layout->cnt; + local->key = gf_strdup(key); + + if (IA_ISDIR(fd->inode->ia_type)) { + local->hashed_subvol = NULL; + ret = dht_dir_common_set_remove_xattr(frame, this, NULL, fd, NULL, 0, + local->xattr_req, &op_errno); + if (ret) + goto err; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; + } else { + local->call_cnt = 1; + ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to " + "set dictionary key %s for fd=%p", + DHT_IATT_IN_XDATA_KEY, fd); } - local->ia_ino = fd->inode->ino; - STACK_WIND (frame, dht_readv_cbk, - subvol, subvol->fops->readv, - fd, size, off); + STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol, + subvol->fops->fremovexattr, fd, key, + local->xattr_req); + } - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL); + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL); - return 0; + return 0; } - int -dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *prebuf, - struct iatt *postbuf) +dht_fd_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, fd_t *fd, dict_t *xdata) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + local = frame->local; + prev = cookie; + + LOCK(&frame->lock); + { if (op_ret == -1) { - goto out; + local->op_errno = op_errno; + UNLOCK(&frame->lock); + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto post_unlock; } - local = frame->local; - if (!local) { - op_ret = -1; - op_errno = EINVAL; - goto out; + local->op_ret = 0; + } + UNLOCK(&frame->lock); +post_unlock: + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) + DHT_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, local->fd, + NULL); + + return 0; +} + +/* + * dht_normalize_stats - + */ +static void +dht_normalize_stats(struct statvfs *buf, unsigned long bsize, + unsigned long frsize) +{ + double factor = 0; + + if (buf->f_bsize != bsize) { + buf->f_bsize = bsize; + } + + if (buf->f_frsize != frsize) { + factor = ((double)buf->f_frsize) / frsize; + buf->f_frsize = frsize; + buf->f_blocks = (fsblkcnt_t)(factor * buf->f_blocks); + buf->f_bfree = (fsblkcnt_t)(factor * buf->f_bfree); + buf->f_bavail = (fsblkcnt_t)(factor * buf->f_bavail); + } +} + +static int +dht_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *statvfs, dict_t *xdata) +{ + gf_boolean_t event = _gf_false; + qdstatfs_action_t action = qdstatfs_action_OFF; + dht_local_t *local = NULL; + int this_call_cnt = 0; + int bsize = 0; + int frsize = 0; + GF_UNUSED int ret = 0; + unsigned long new_usage = 0; + unsigned long cur_usage = 0; + + local = 
frame->local; + GF_ASSERT(local); + + if (xdata) + ret = dict_get_int8(xdata, "quota-deem-statfs", (int8_t *)&event); + + LOCK(&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + goto unlock; + } + if (!statvfs) { + op_errno = EINVAL; + local->op_ret = -1; + goto unlock; + } + local->op_ret = 0; + + if (local->quota_deem_statfs) { + if (event == _gf_true) { + action = qdstatfs_action_COMPARE; + } else { + action = qdstatfs_action_NEGLECT; + } + } else { + if (event == _gf_true) { + action = qdstatfs_action_REPLACE; + local->quota_deem_statfs = _gf_true; + } } - prebuf->ia_ino = local->ia_ino; - postbuf->ia_ino = local->ia_ino; + if (local->quota_deem_statfs) { + switch (action) { + case qdstatfs_action_NEGLECT: + goto unlock; -out: - DHT_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf); + case qdstatfs_action_REPLACE: + local->statvfs = *statvfs; + goto unlock; - return 0; -} + case qdstatfs_action_COMPARE: + new_usage = statvfs->f_blocks - statvfs->f_bfree; + cur_usage = local->statvfs.f_blocks - + local->statvfs.f_bfree; + /* Take the max of the usage from subvols */ + if (new_usage >= cur_usage) + local->statvfs = *statvfs; + goto unlock; + + default: + break; + } + } + + if (local->statvfs.f_bsize != 0) { + bsize = max(local->statvfs.f_bsize, statvfs->f_bsize); + frsize = max(local->statvfs.f_frsize, statvfs->f_frsize); + dht_normalize_stats(&local->statvfs, bsize, frsize); + dht_normalize_stats(statvfs, bsize, frsize); + } else { + local->statvfs.f_bsize = statvfs->f_bsize; + local->statvfs.f_frsize = statvfs->f_frsize; + } + + local->statvfs.f_blocks += statvfs->f_blocks; + local->statvfs.f_bfree += statvfs->f_bfree; + local->statvfs.f_bavail += statvfs->f_bavail; + local->statvfs.f_files += statvfs->f_files; + local->statvfs.f_ffree += statvfs->f_ffree; + local->statvfs.f_favail += statvfs->f_favail; + local->statvfs.f_fsid = statvfs->f_fsid; + local->statvfs.f_flag = statvfs->f_flag; + local->statvfs.f_namemax = 
statvfs->f_namemax; + } +unlock: + UNLOCK(&frame->lock); + + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) + DHT_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno, + &local->statvfs, xdata); + + return 0; +} int -dht_writev (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iovec *vector, int count, off_t off, - struct iobref *iobref) +dht_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int i = -1; + inode_t *inode = NULL; + inode_table_t *itable = NULL; + static uuid_t root_gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + loc_t newloc = { + 0, + }; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(this->private, err); - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; + conf = this->private; + + local = dht_local_init(frame, NULL, NULL, GF_FOP_STATFS); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + if (loc->inode && !IA_ISDIR(loc->inode->ia_type)) { + itable = loc->inode->table; + if (!itable) { + op_errno = EINVAL; + goto err; } - local = dht_local_init (frame); - if (!local) { + loc = &local->loc2; - op_errno = ENOMEM; - goto err; + inode = inode_find(itable, root_gfid); + if (!inode) { + op_errno = EINVAL; + goto err; } - local->ia_ino = fd->inode->ino; + dht_build_root_loc(inode, &newloc); + loc = &newloc; + } - STACK_WIND (frame, dht_writev_cbk, - subvol, subvol->fops->writev, - fd, vector, count, off, iobref); + local->call_cnt = conf->subvolume_cnt; - return 0; + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND(frame, 
dht_statfs_cbk, conf->subvolumes[i], + conf->subvolumes[i]->fops->statfs, loc, xdata); + } + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } - int -dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +dht_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int i = -1; + int ret = 0; + gf_boolean_t new_xdata = _gf_false; + xlator_t **subvolumes = NULL; + int call_count = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + local = dht_local_init(frame, loc, fd, GF_FOP_OPENDIR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + local->first_up_subvol = dht_first_up_subvol(this); + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + op_errno = ENOMEM; + goto err; + } + new_xdata = _gf_true; + } + + ret = dict_set_uint32(xdata, conf->link_xattr_name, 256); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value : key = %s", + conf->link_xattr_name); + + /* dht_readdirp will wind to all subvols so open has to be sent to + * all subvols whether or not conf->local_subvols is set */ + + call_count = local->call_cnt = conf->subvolume_cnt; + subvolumes = conf->subvolumes; + + /* In case of parallel-readdir, the readdir-ahead will be loaded + * below dht, in this case, if we want to enable or disable SKIP_DIRs + * it has to be done in opendir, so that prefetching logic in + * readdir-ahead, honors it */ + for (i = 0; i < call_count; i++) { + if (conf->readdir_optimize == _gf_true) { + if 
(subvolumes[i] != local->first_up_subvol) { + ret = dict_set_int32(xdata, GF_READDIR_SKIP_DIRS, 1); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary" + " value :key = %s, ret:%d", + GF_READDIR_SKIP_DIRS, ret); + } + } + + STACK_WIND_COOKIE(frame, dht_fd_cbk, subvolumes[i], subvolumes[i], + subvolumes[i]->fops->opendir, loc, fd, xdata); + dict_del(xdata, GF_READDIR_SKIP_DIRS); + } + + if (new_xdata) + dict_unref(xdata); + + return 0; +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(opendir, frame, -1, op_errno, NULL, NULL); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + return 0; +} - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } +/* dht_readdirp_cbk creates a new dentry and dentry->inode is not assigned. + This functions assigns an inode if all of the following conditions are + true: - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; + * DHT has only one child. In this case the entire layout is present on + this single child and hence we can set complete layout in inode. + * backend has complete layout and there are no anomalies in it and from + this information layout can be constructed and set in inode. +*/ - goto err; - } +static void +dht_populate_inode_for_dentry(xlator_t *this, xlator_t *subvol, + gf_dirent_t *entry, gf_dirent_t *orig_entry) +{ + dht_layout_t *layout = NULL; + int ret = 0; + loc_t loc = { + 0, + }; + + if (gf_uuid_is_null(orig_entry->d_stat.ia_gfid)) { + /* this skips the '..' 
entry for the root of the volume */ + return; + } - local->fd = fd_ref (fd); - local->call_cnt = 1; + gf_uuid_copy(loc.gfid, orig_entry->d_stat.ia_gfid); + loc.inode = inode_ref(orig_entry->inode); - STACK_WIND (frame, dht_err_cbk, - subvol, subvol->fops->flush, fd); + if (is_revalidate(&loc)) { + goto out; + } - return 0; + layout = dht_layout_new(this, 1); + if (!layout) + goto out; -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (flush, frame, -1, op_errno); + ret = dht_layout_merge(this, layout, subvol, 0, 0, orig_entry->dict); + if (!ret) { + ret = dht_layout_normalize(this, &loc, layout); + if (ret == 0) { + dht_layout_set(this, orig_entry->inode, layout); + entry->inode = inode_ref(orig_entry->inode); + layout = NULL; + } + } - return 0; -} + if (layout) + dht_layout_unref(this, layout); +out: + loc_wipe(&loc); + return; +} -int -dht_fsync (call_frame_t *frame, xlator_t *this, - fd_t *fd, int datasync) +/* Posix returns op_errno = ENOENT to indicate that there are no more + * entries + */ +static int +dht_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *orig_entries, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - + dht_local_t *local = NULL; + gf_dirent_t entries; + gf_dirent_t *orig_entry = NULL; + gf_dirent_t *entry = NULL; + xlator_t *prev = NULL; + xlator_t *next_subvol = NULL; + off_t next_offset = 0; + int count = 0; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; + xlator_t *subvol = 0; + xlator_t *hashed_subvol = 0; + int ret = 0; + int readdir_optimize = 0; + inode_table_t *itable = NULL; + inode_t *inode = NULL; + gf_boolean_t skip_hashed_check = _gf_false; + + INIT_LIST_HEAD(&entries.list); + + prev = cookie; + local = frame->local; + GF_VALIDATE_OR_GOTO(this->name, local->fd, unwind); + + itable = local->fd->inode->table; + + conf = this->private; + 
GF_VALIDATE_OR_GOTO(this->name, conf, unwind); + + methods = &(conf->methods); + + if (op_ret <= 0) { + goto done; + } + + /* Why aren't we skipping DHT entirely in case of a single subvol? + * Because if this was a larger volume earlier and all but one subvol + * was removed, there might be stale linkto files on the subvol. + */ + if (conf->subvolume_cnt == 1) { + /* return all directory and file entries except + * linkto files for a single child DHT + */ + skip_hashed_check = _gf_true; + } - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + if (!local->layout) + local->layout = dht_layout_get(this, local->fd->inode); - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; + layout = local->layout; + + /* This will skip the entries on the subvol without a layout, + * hence preventing the crash but rmdir might fail with + * "directory not empty" errors*/ + + if (layout == NULL) + goto done; + + if (conf->readdir_optimize == _gf_true) + readdir_optimize = 1; + + gf_msg_debug(this->name, 0, "Processing entries from %s", prev->name); + + list_for_each_entry(orig_entry, (&orig_entries->list), list) + { + next_offset = orig_entry->d_off; + + gf_msg_debug(this->name, 0, "%s: entry = %s, type = %d", prev->name, + orig_entry->d_name, orig_entry->d_type); + + if (IA_ISINVAL(orig_entry->d_stat.ia_type)) { + /*stat failed somewhere- display this entry but the data may + * be inaccurate. 
+ */ + gf_msg_debug(this->name, EINVAL, "Invalid stat for %s (gfid %s)", + orig_entry->d_name, + uuid_utoa(orig_entry->d_stat.ia_gfid)); + } + + if (check_is_linkfile(NULL, (&orig_entry->d_stat), orig_entry->dict, + conf->link_xattr_name)) { + gf_msg_debug(this->name, 0, "%s: %s is a linkto file", prev->name, + orig_entry->d_name); + continue; + } + + if (skip_hashed_check) { + goto list; + } + + if (check_is_dir(NULL, (&orig_entry->d_stat), NULL)) { + /*Directory entries filtering : + * a) If rebalance is running, pick from first_up_subvol + * b) (rebalance not running)hashed subvolume is NULL or + * down then filter in first_up_subvolume. Other wise the + * corresponding hashed subvolume will take care of the + * directory entry. + */ + if (readdir_optimize) { + if (prev == local->first_up_subvol) + goto list; + else + continue; + } + + hashed_subvol = methods->layout_search(this, layout, + orig_entry->d_name); + + if (prev == hashed_subvol) + goto list; + if ((hashed_subvol && dht_subvol_status(conf, hashed_subvol)) || + (prev != local->first_up_subvol)) + continue; + + goto list; + } + + list: + entry = gf_dirent_for_name(orig_entry->d_name); + if (!entry) { + goto unwind; + } + + /* Do this if conf->search_unhashed is set to "auto" */ + if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) { + subvol = methods->layout_search(this, layout, orig_entry->d_name); + if (!subvol || (subvol != prev)) { + /* TODO: Count the number of entries which need + linkfile to prove its existence in fs */ + layout->search_unhashed++; + } + } + + entry->d_off = orig_entry->d_off; + entry->d_stat = orig_entry->d_stat; + entry->d_ino = orig_entry->d_ino; + entry->d_type = orig_entry->d_type; + entry->d_len = orig_entry->d_len; + + if (orig_entry->dict) + entry->dict = dict_ref(orig_entry->dict); + + /* making sure we set the inode ctx right with layout, + currently possible only for non-directories, so for + directories don't set entry inodes */ + if 
(IA_ISDIR(entry->d_stat.ia_type)) { + entry->d_stat.ia_blocks = DHT_DIR_STAT_BLOCKS; + entry->d_stat.ia_size = DHT_DIR_STAT_SIZE; + if (orig_entry->inode) { + dht_inode_ctx_time_update(orig_entry->inode, this, + &entry->d_stat, 1); + + if (conf->subvolume_cnt == 1) { + dht_populate_inode_for_dentry(this, prev, entry, + orig_entry); + } + } + } else { + if (orig_entry->dict && + dict_get(orig_entry->dict, conf->link_xattr_name)) { + /* Strip out the S and T flags set by rebalance*/ + DHT_STRIP_PHASE1_FLAGS(&entry->d_stat); + } + + if (orig_entry->inode) { + ret = dht_layout_preset(this, prev, orig_entry->inode); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_LAYOUT_SET_FAILED, + "failed to link the layout " + "in inode for %s", + orig_entry->d_name); + + entry->inode = inode_ref(orig_entry->inode); + } else if (itable) { + /* + * orig_entry->inode might be null if any upper + * layer xlators below client set to null, to + * force a lookup on the inode even if the inode + * is present in the inode table. In that case + * we just update the ctx to make sure we didn't + * missed anything. 
+ */ + inode = inode_find(itable, orig_entry->d_stat.ia_gfid); + if (inode) { + ret = dht_layout_preset(this, prev, inode); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_LAYOUT_SET_FAILED, + "failed to link the layout" + " in inode for %s", + orig_entry->d_name); + inode_unref(inode); + inode = NULL; + } + } } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; + gf_msg_debug(this->name, 0, "%s: Adding entry = %s", prev->name, + entry->d_name); - goto err; - } - local->call_cnt = 1; + list_add_tail(&entry->list, &entries.list); + count++; + } - local->ia_ino = fd->inode->ino; +done: - STACK_WIND (frame, dht_fsync_cbk, - subvol, subvol->fops->fsync, - fd, datasync); + /* We need to ensure that only the last subvolume's end-of-directory + * notification is respected so that directory reading does not stop + * before all subvolumes have been read. That could happen because the + * posix for each subvolume sends a ENOENT on end-of-directory but in + * distribute we're not concerned only with a posix's view of the + * directory but the aggregated namespace' view of the directory. + * Possible values: + * op_ret == 0 and op_errno != 0 + * if op_errno != ENOENT : Error.Unwind. + * if op_errno == ENOENT : There are no more entries on this subvol. + * Move to the next one. + * op_ret > 0 and count == 0 : + * The subvol returned entries to dht but all were stripped out. + * For example, if they were linkto files or dirs where + * hashed_subvol != prev. Try to get some entries by winding + * to the next subvol. This can be dangerous if parallel readdir + * is enabled as it grows the stack. + * + * op_ret > 0 and count > 0: + * We found some entries. Unwind even if the buffer is not full. 
+ * + */ + + op_ret = count; + if (count == 0) { + /* non-zero next_offset means that + * EOF is not yet hit on the current subvol + */ + if ((next_offset == 0) || (op_errno == ENOENT)) { + next_offset = 0; + next_subvol = dht_subvol_next(this, prev); + } else { + next_subvol = prev; + } - return 0; + if (!next_subvol) { + goto unwind; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL); + if (conf->readdir_optimize == _gf_true) { + if (next_subvol != local->first_up_subvol) { + ret = dict_set_int32(local->xattr, GF_READDIR_SKIP_DIRS, 1); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value" + ":key = %s", + GF_READDIR_SKIP_DIRS); + } else { + dict_del(local->xattr, GF_READDIR_SKIP_DIRS); + } + } + STACK_WIND_COOKIE(frame, dht_readdirp_cbk, next_subvol, next_subvol, + next_subvol->fops->readdirp, local->fd, local->size, + next_offset, local->xattr); return 0; -} + } +unwind: + /* We need to ensure that only the last subvolume's end-of-directory + * notification is respected so that directory reading does not stop + * before all subvolumes have been read. That could happen because the + * posix for each subvolume sends a ENOENT on end-of-directory but in + * distribute we're not concerned only with a posix's view of the + * directory but the aggregated namespace' view of the directory. 
+ */ + if (op_ret < 0) + op_ret = 0; + + if (prev != dht_last_up_subvol(this)) + op_errno = 0; + + DHT_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL); + + gf_dirent_free(&entries); + return 0; +} -int -dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct gf_flock *flock) +static int +dht_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *orig_entries, dict_t *xdata) { - DHT_STACK_UNWIND (lk, frame, op_ret, op_errno, flock); + dht_local_t *local = NULL; + gf_dirent_t entries; + gf_dirent_t *orig_entry = NULL; + gf_dirent_t *entry = NULL; + xlator_t *prev = NULL; + xlator_t *next_subvol = NULL; + off_t next_offset = 0; + int count = 0; + dht_layout_t *layout = 0; + xlator_t *subvol = 0; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; + gf_boolean_t skip_hashed_check = _gf_false; - return 0; -} + INIT_LIST_HEAD(&entries.list); + prev = cookie; + local = frame->local; -int -dht_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int cmd, struct gf_flock *flock) -{ - xlator_t *subvol = NULL; - int op_errno = -1; + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, done); + methods = &(conf->methods); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + if (op_ret <= 0) + goto done; - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + if (!local->layout) + local->layout = dht_layout_get(this, local->fd->inode); - STACK_WIND (frame, dht_lk_cbk, - subvol, subvol->fops->lk, - fd, cmd, flock); + layout = local->layout; - return 0; + gf_msg_debug(this->name, 0, "Processing entries from %s", prev->name); -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (lk, frame, -1, op_errno, NULL); + if (conf->subvolume_cnt == 1) { + /*return everything*/ + skip_hashed_check = _gf_true; + count = op_ret; + goto done; + } - return 0; -} + list_for_each_entry(orig_entry, (&orig_entries->list), list) + { + next_offset = orig_entry->d_off; -/* - * dht_normalize_stats - - */ -static void -dht_normalize_stats (struct statvfs *buf, unsigned long bsize, - unsigned long frsize) -{ - double factor = 0; + gf_msg_debug(this->name, 0, "%s: entry = %s, type = %d", prev->name, + orig_entry->d_name, orig_entry->d_type); - if (buf->f_bsize != bsize) { - buf->f_bsize = bsize; - } + subvol = methods->layout_search(this, layout, orig_entry->d_name); - if (buf->f_frsize != frsize) { - factor = ((double) buf->f_frsize) / frsize; - buf->f_frsize = frsize; - buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks); - buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree); - buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail); + if (!subvol || (subvol == prev)) { + entry = gf_dirent_for_name(orig_entry->d_name); + if (!entry) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "Memory allocation failed "); + goto unwind; + } + + entry->d_off = orig_entry->d_off; + entry->d_ino = orig_entry->d_ino; + entry->d_type = orig_entry->d_type; + entry->d_len = orig_entry->d_len; + + gf_msg_debug(this->name, 0, "%s: Adding = entry %s", prev->name, + entry->d_name); + list_add_tail(&entry->list, &entries.list); + count++; } + } +done: + op_ret = count; + /* We need to ensure that only the last subvolume's end-of-directory + * notification is respected so that directory reading does not stop + * before all subvolumes have been read. That could happen because the + * posix for each subvolume sends a ENOENT on end-of-directory but in + * distribute we're not concerned only with a posix's view of the + * directory but the aggregated namespace' view of the directory. 
+ */ + if (count == 0) { + if ((next_offset == 0) || (op_errno == ENOENT)) { + next_offset = 0; + next_subvol = dht_subvol_next(this, prev); + } else { + next_subvol = prev; + } + + if (!next_subvol) { + goto unwind; + } + + STACK_WIND_COOKIE(frame, dht_readdir_cbk, next_subvol, next_subvol, + next_subvol->fops->readdir, local->fd, local->size, + next_offset, NULL); + return 0; + } + +unwind: + /* We need to ensure that only the last subvolume's end-of-directory + * notification is respected so that directory reading does not stop + * before all subvolumes have been read. That could happen because the + * posix for each subvolume sends a ENOENT on end-of-directory but in + * distribute we're not concerned only with a posix's view of the + * directory but the aggregated namespace' view of the directory. + */ + + if (prev != dht_last_up_subvol(this)) + op_errno = 0; + + if (!skip_hashed_check) { + DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL); + gf_dirent_free(&entries); + + } else { + DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, orig_entries, NULL); + } + return 0; } -int -dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct statvfs *statvfs) +static int +dht_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t yoff, int whichop, dict_t *dict) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - int bsize = 0; - int frsize = 0; + dht_local_t *local = NULL; + int op_errno = -1; + xlator_t *xvol = NULL; + int ret = 0; + dht_conf_t *conf = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + local = dht_local_init(frame, NULL, NULL, whichop); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->fd = fd_ref(fd); + local->size = size; + local->xattr_req = (dict) ? 
dict_ref(dict) : NULL; + local->first_up_subvol = dht_first_up_subvol(this); + local->op_ret = -1; + + dht_deitransform(this, yoff, &xvol); + + /* TODO: do proper readdir */ + if (whichop == GF_FOP_READDIRP) { + if (dict) + local->xattr = dict_ref(dict); + else + local->xattr = dict_new(); + + if (local->xattr) { + ret = dict_set_uint32(local->xattr, conf->link_xattr_name, 256); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value" + " : key = %s", + conf->link_xattr_name); + + if (conf->readdir_optimize == _gf_true) { + if (xvol != local->first_up_subvol) { + ret = dict_set_int32(local->xattr, GF_READDIR_SKIP_DIRS, 1); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_DICT_SET_FAILED, + "Failed to set " + "dictionary value: " + "key = %s", + GF_READDIR_SKIP_DIRS); + } else { + dict_del(local->xattr, GF_READDIR_SKIP_DIRS); + } + } + + if (conf->subvolume_cnt == 1) { + ret = dict_set_uint32(local->xattr, conf->xattr_name, 4 * 4); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, + DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary " + "value:key = %s ", + conf->xattr_name); + } + } + } + STACK_WIND_COOKIE(frame, dht_readdirp_cbk, xvol, xvol, + xvol->fops->readdirp, fd, size, yoff, local->xattr); + } else { + STACK_WIND_COOKIE(frame, dht_readdir_cbk, xvol, xvol, + xvol->fops->readdir, fd, size, yoff, local->xattr); + } - local = frame->local; + return 0; - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - goto unlock; - } - local->op_ret = 0; +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL); - if (local->statvfs.f_bsize != 0) { - bsize = max(local->statvfs.f_bsize, statvfs->f_bsize); - frsize = max(local->statvfs.f_frsize, statvfs->f_frsize); - dht_normalize_stats(&local->statvfs, bsize, frsize); - dht_normalize_stats(statvfs, bsize, frsize); - } else { - local->statvfs.f_bsize = statvfs->f_bsize; - local->statvfs.f_frsize = statvfs->f_frsize; - } + return 0; +} + +int +dht_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t yoff, dict_t *xdata) +{ + int op = GF_FOP_READDIR; + dht_conf_t *conf = NULL; + int i = 0; - local->statvfs.f_blocks += statvfs->f_blocks; - local->statvfs.f_bfree += statvfs->f_bfree; - local->statvfs.f_bavail += statvfs->f_bavail; - local->statvfs.f_files += statvfs->f_files; - local->statvfs.f_ffree += statvfs->f_ffree; - local->statvfs.f_favail += statvfs->f_favail; - local->statvfs.f_fsid = statvfs->f_fsid; - local->statvfs.f_flag = statvfs->f_flag; - local->statvfs.f_namemax = statvfs->f_namemax; + conf = this->private; + if (!conf) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->subvolume_status[i]) { + op = GF_FOP_READDIRP; + break; } -unlock: - UNLOCK (&frame->lock); + } + if (conf->use_readdirp) + op = GF_FOP_READDIRP; - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, - &local->statvfs); +out: + dht_do_readdir(frame, this, fd, size, yoff, op, 0); + return 0; +} - return 0; +int +dht_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t yoff, dict_t *dict) +{ + dht_do_readdir(frame, this, fd, size, yoff, GF_FOP_READDIRP, dict); + return 0; } +static int +dht_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + + local = frame->local; + + LOCK(&frame->lock); + { + if 
(op_ret == -1) + local->op_errno = op_errno; + else if (op_ret == 0) + local->op_ret = 0; + } + UNLOCK(&frame->lock); + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) + DHT_STACK_UNWIND(fsyncdir, frame, local->op_ret, local->op_errno, + xdata); + + return 0; +} int -dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) +dht_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + dict_t *xdata) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int op_errno = -1; - int i = -1; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int i = -1; + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + VALIDATE_OR_GOTO(this->private, err); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (this->private, err); + conf = this->private; - conf = this->private; + local = dht_local_init(frame, NULL, NULL, GF_FOP_FSYNCDIR); + if (!local) { + op_errno = ENOMEM; + goto err; + } - local = dht_local_init (frame); - local->call_cnt = conf->subvolume_cnt; + local->fd = fd_ref(fd); + local->call_cnt = conf->subvolume_cnt; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_statfs_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->statfs, loc); - } + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND(frame, dht_fsyncdir_cbk, conf->subvolumes[i], + conf->subvolumes[i]->fops->fsyncdir, fd, datasync, xdata); + } - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (statfs, frame, -1, op_errno, NULL); + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(fsyncdir, frame, -1, op_errno, NULL); - return 0; + return 0; } - int -dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +dht_newfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - int op_errno = -1; - int i = -1; + xlator_t *prev = NULL; + int ret = -1; + dht_local_t *local = NULL; + + if (op_ret == -1) + goto out; + + local = frame->local; + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + prev = cookie; + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0); + dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1); + } + + ret = dht_layout_preset(this, prev, inode); + if (ret < 0) { + gf_msg_debug(this->name, EINVAL, + "could not set pre-set layout for subvolume %s", + prev ? prev->name : NULL); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + if (local->linked == _gf_true) + dht_linkfile_attr_heal(frame, this); +out: + /* + * FIXME: ia_size and st_blocks of preparent and postparent do not have + * correct values. since, preparent and postparent buffers correspond + * to a directory these two members should have values equal to sum of + * corresponding values from each of the subvolume. + * See dht_iatt_merge for reference. 
+ */ + DHT_STRIP_PHASE1_FLAGS(stbuf); + dht_set_fixed_dir_stat(postparent); + dht_set_fixed_dir_stat(preparent); + + if (local && local->lock[0].layout.parent_layout.locks) { + /* store op_errno for failure case*/ + local->op_errno = op_errno; + local->refresh_layout_unlock(frame, this, op_ret, 1); + if (op_ret == 0) { + DHT_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, stbuf, + preparent, postparent, xdata); + } + } else { + DHT_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, stbuf, + preparent, postparent, xdata); + } - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (this->private, err); + return 0; +} - conf = this->private; +static int +dht_mknod_linkfile_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *cached_subvol = NULL; + dht_conf_t *conf = NULL; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; + local = frame->local; - goto err; - } + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + goto err; + } - local->fd = fd_ref (fd); - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = ENOMEM; + if (op_ret == -1) { + local->op_errno = op_errno; + goto err; + } - goto err; - } + conf = this->private; + if (!conf) { + local->op_errno = EINVAL; + op_errno = EINVAL; + goto err; + } - local->call_cnt = conf->subvolume_cnt; + cached_subvol = local->cached_subvol; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_fd_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->opendir, - loc, fd); - } + if (local->params) { + dict_del(local->params, conf->link_xattr_name); + dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY); + } - return 0; + STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)cached_subvol, + cached_subvol, 
cached_subvol->fops->mknod, &local->loc, + local->mode, local->rdev, local->umask, local->params); + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (opendir, frame, -1, op_errno, NULL); - - return 0; + if (local && local->lock[0].layout.parent_layout.locks) { + local->refresh_layout_unlock(frame, this, -1, 1); + } else { + DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); + } + return 0; } +static int +dht_mknod_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this, + xlator_t *subvol, loc_t *loc, dev_t rdev, + mode_t mode, mode_t umask, dict_t *params) +{ + dht_local_t *local = NULL; + xlator_t *avail_subvol = NULL; + + local = frame->local; -int -dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, - int op_errno, gf_dirent_t *orig_entries) -{ - dht_local_t *local = NULL; - gf_dirent_t entries; - gf_dirent_t *orig_entry = NULL; - gf_dirent_t *entry = NULL; - call_frame_t *prev = NULL; - xlator_t *next_subvol = NULL; - off_t next_offset = 0; - int count = 0; - dht_layout_t *layout = 0; - dht_conf_t *conf = NULL; - xlator_t *subvol = 0; - - INIT_LIST_HEAD (&entries.list); - prev = cookie; - local = frame->local; - conf = this->private; + if (!dht_is_subvol_filled(this, subvol)) { + gf_msg_debug(this->name, 0, "creating %s on %s", loc->path, + subvol->name); - if (op_ret < 0) - goto done; + STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, params); + } else { + avail_subvol = dht_free_disk_available_subvol(this, subvol, local); - if (!local->layout) - local->layout = dht_layout_get (this, local->fd->inode); + if (avail_subvol != subvol) { + local->params = dict_ref(params); + local->rdev = rdev; + local->mode = mode; + local->umask = umask; + local->cached_subvol = avail_subvol; + local->hashed_subvol = subvol; - layout = local->layout; + gf_msg_debug(this->name, 0, "creating %s on %s (link at %s)", + 
loc->path, avail_subvol->name, subvol->name); - list_for_each_entry (orig_entry, (&orig_entries->list), list) { - next_offset = orig_entry->d_off; + dht_linkfile_create(frame, dht_mknod_linkfile_create_cbk, this, + avail_subvol, subvol, loc); - if (check_is_linkfile (NULL, (&orig_entry->d_stat), NULL) - || (check_is_dir (NULL, (&orig_entry->d_stat), NULL) - && (prev->this != dht_first_up_subvol (this)))) { - continue; - } + goto out; + } - entry = gf_dirent_for_name (orig_entry->d_name); - if (!entry) { + gf_msg_debug(this->name, 0, "creating %s on %s", loc->path, + subvol->name); - goto unwind; - } + STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, params); + } +out: + return 0; +} - /* Do this if conf->search_unhashed is set to "auto" */ - if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) { - subvol = dht_layout_search (this, layout, - orig_entry->d_name); - if (!subvol || (subvol != prev->this)) { - /* TODO: Count the number of entries which need - linkfile to prove its existance in fs */ - layout->search_unhashed++; - } - } - entry->d_stat = orig_entry->d_stat; - - dht_itransform (this, prev->this, orig_entry->d_ino, - &entry->d_ino); - dht_itransform (this, prev->this, orig_entry->d_off, - &entry->d_off); - - entry->d_stat.ia_ino = entry->d_ino; - entry->d_type = orig_entry->d_type; - entry->d_len = orig_entry->d_len; - - list_add_tail (&entry->list, &entries.list); - count++; - } - op_ret = count; - /* We need to ensure that only the last subvolume's end-of-directory - * notification is respected so that directory reading does not stop - * before all subvolumes have been read. That could happen because the - * posix for each subvolume sends a ENOENT on end-of-directory but in - * distribute we're not concerned only with a posix's view of the - * directory but the aggregated namespace' view of the directory. 
- */ - if (prev->this != dht_last_up_subvol (this)) - op_errno = 0; +static int32_t +dht_mknod_do(call_frame_t *frame) +{ + dht_local_t *local = NULL; + dht_layout_t *refreshed = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; -done: - if (count == 0) { - /* non-zero next_offset means that - EOF is not yet hit on the current subvol - */ - if (next_offset == 0) { - next_subvol = dht_subvol_next (this, prev->this); - } else { - next_subvol = prev->this; - } + local = frame->local; - if (!next_subvol) { - goto unwind; - } + this = THIS; - STACK_WIND (frame, dht_readdirp_cbk, - next_subvol, next_subvol->fops->readdirp, - local->fd, local->size, next_offset); - return 0; - } + conf = this->private; -unwind: - if (op_ret < 0) - op_ret = 0; + GF_VALIDATE_OR_GOTO(this->name, conf, err); - DHT_STACK_UNWIND (readdirp, frame, op_ret, op_errno, &entries); + methods = &(conf->methods); - gf_dirent_free (&entries); + /* We don't need parent_loc anymore */ + loc_wipe(&local->loc); - return 0; -} + loc_copy(&local->loc, &local->loc2); + loc_wipe(&local->loc2); + refreshed = local->selfheal.refreshed_layout; -int -dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *orig_entries) -{ - dht_local_t *local = NULL; - gf_dirent_t entries; - gf_dirent_t *orig_entry = NULL; - gf_dirent_t *entry = NULL; - call_frame_t *prev = NULL; - xlator_t *next_subvol = NULL; - off_t next_offset = 0; - int count = 0; - dht_layout_t *layout = 0; - dht_conf_t *conf = NULL; - xlator_t *subvol = 0; - - INIT_LIST_HEAD (&entries.list); - prev = cookie; - local = frame->local; - conf = this->private; + subvol = methods->layout_search(this, refreshed, local->loc.name); - if (op_ret < 0) - goto done; + if (!subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "no subvolume in " + "layout for path=%s", + local->loc.path); + local->op_errno = ENOENT; + goto 
err; + } - if (!local->layout) - local->layout = dht_layout_get (this, local->fd->inode); + dht_mknod_wind_to_avail_subvol(frame, this, subvol, &local->loc, + local->rdev, local->mode, local->umask, + local->params); + return 0; +err: + local->refresh_layout_unlock(frame, this, -1, 1); - layout = local->layout; + return 0; +} - list_for_each_entry (orig_entry, (&orig_entries->list), list) { - next_offset = orig_entry->d_off; +static int32_t +dht_mknod_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + DHT_STACK_DESTROY(frame); + return 0; +} - subvol = dht_layout_search (this, layout, orig_entry->d_name); +static int32_t +dht_mknod_finish(call_frame_t *frame, xlator_t *this, int op_ret, + int invoke_cbk) +{ + dht_local_t *local = NULL, *lock_local = NULL; + call_frame_t *lock_frame = NULL; + int lock_count = 0; + + local = frame->local; + lock_count = dht_lock_count(local->lock[0].layout.parent_layout.locks, + local->lock[0].layout.parent_layout.lk_count); + if (lock_count == 0) + goto done; + + lock_frame = copy_frame(frame); + if (lock_frame == NULL) { + goto done; + } + + lock_local = dht_local_init(lock_frame, &local->loc, NULL, + lock_frame->root->op); + if (lock_local == NULL) { + goto done; + } + + lock_local->lock[0] + .layout.parent_layout.locks = local->lock[0].layout.parent_layout.locks; + lock_local->lock[0].layout.parent_layout.lk_count = + local->lock[0].layout.parent_layout.lk_count; + + local->lock[0].layout.parent_layout.locks = NULL; + local->lock[0].layout.parent_layout.lk_count = 0; + + dht_unlock_inodelk(lock_frame, + lock_local->lock[0].layout.parent_layout.locks, + lock_local->lock[0].layout.parent_layout.lk_count, + dht_mknod_unlock_cbk); + lock_frame = NULL; - if (!subvol || (subvol == prev->this)) { - entry = gf_dirent_for_name (orig_entry->d_name); - if (!entry) { - gf_log (this->name, GF_LOG_ERROR, - "memory allocation failed :("); - goto unwind; - } +done: + if 
(lock_frame != NULL) { + DHT_STACK_DESTROY(lock_frame); + } - dht_itransform (this, prev->this, orig_entry->d_ino, - &entry->d_ino); - dht_itransform (this, prev->this, orig_entry->d_off, - &entry->d_off); + if (op_ret == 0) + return 0; - entry->d_type = orig_entry->d_type; - entry->d_len = orig_entry->d_len; + DHT_STACK_UNWIND(mknod, frame, op_ret, local->op_errno, NULL, NULL, NULL, + NULL, NULL); + return 0; +} - list_add_tail (&entry->list, &entries.list); - count++; - } - } - op_ret = count; - /* We need to ensure that only the last subvolume's end-of-directory - * notification is respected so that directory reading does not stop - * before all subvolumes have been read. That could happen because the - * posix for each subvolume sends a ENOENT on end-of-directory but in - * distribute we're not concerned only with a posix's view of the - * directory but the aggregated namespace' view of the directory. - */ - if (prev->this != dht_last_up_subvol (this)) - op_errno = 0; +static int32_t +dht_mknod_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; -done: - if (count == 0) { - /* non-zero next_offset means that - EOF is not yet hit on the current subvol - */ - if (next_offset == 0) { - next_subvol = dht_subvol_next (this, prev->this); - } else { - next_subvol = prev->this; - } + local = frame->local; - if (!next_subvol) { - goto unwind; - } + if (!local) { + goto err; + } - STACK_WIND (frame, dht_readdir_cbk, - next_subvol, next_subvol->fops->readdir, - local->fd, local->size, next_offset); - return 0; - } + if (op_ret < 0) { + gf_msg("DHT", GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR, + "mknod lock failed for file: %s", local->loc2.name); -unwind: - if (op_ret < 0) - op_ret = 0; + local->op_errno = op_errno; - DHT_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries); + goto err; + } - gf_dirent_free (&entries); + local->refresh_layout_unlock = dht_mknod_finish; - return 0; 
-} + local->refresh_layout_done = dht_mknod_do; + dht_refresh_layout(frame); -int -dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff, int whichop) + return 0; +err: + if (local) + dht_mknod_finish(frame, this, -1, 0); + else + DHT_STACK_UNWIND(mknod, frame, -1, EINVAL, NULL, NULL, NULL, NULL, + NULL); + return 0; +} + +static int32_t +dht_mknod_lock(call_frame_t *frame, xlator_t *subvol) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int op_errno = -1; - xlator_t *xvol = NULL; - off_t xoff = 0; + dht_local_t *local = NULL; + int count = 1, ret = -1; + dht_lock_t **lk_array = NULL; + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + local = frame->local; - conf = this->private; + lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer); - local = dht_local_init (frame); - if (!local) { + if (lk_array == NULL) + goto err; - op_errno = ENOMEM; - goto err; - } + lk_array[0] = dht_lock_new(frame->this, subvol, &local->loc, F_RDLCK, + DHT_LAYOUT_HEAL_DOMAIN, NULL, + IGNORE_ENOENT_ESTALE); - local->fd = fd_ref (fd); - local->size = size; + if (lk_array[0] == NULL) + goto err; - dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); + local->lock[0].layout.parent_layout.locks = lk_array; + local->lock[0].layout.parent_layout.lk_count = count; - /* TODO: do proper readdir */ - if (whichop == GF_FOP_READDIR) - STACK_WIND (frame, dht_readdir_cbk, xvol, xvol->fops->readdir, - fd, size, xoff); - else - STACK_WIND (frame, dht_readdirp_cbk, xvol, xvol->fops->readdirp, - fd, size, xoff); + ret = dht_blocking_inodelk(frame, lk_array, count, dht_mknod_lock_cbk); - return 0; + if (ret < 0) { + local->lock[0].layout.parent_layout.locks = NULL; + local->lock[0].layout.parent_layout.lk_count = 0; + goto err; + } + return 0; err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (readdir, frame, -1, op_errno, NULL); + if (lk_array != NULL) { + dht_lock_array_free(lk_array, count); + GF_FREE(lk_array); + } - return 0; + return -1; } - -int -dht_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff) +static int +dht_refresh_parent_layout_resume(call_frame_t *frame, xlator_t *this, int ret, + int invoke_cbk) { - int op = GF_FOP_READDIR; - dht_conf_t *conf = NULL; - int i = 0; + dht_local_t *local = NULL, *parent_local = NULL; + call_stub_t *stub = NULL; + call_frame_t *parent_frame = NULL; - conf = this->private; - if (!conf) - goto out; + local = frame->local; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (!conf->subvolume_status[i]) { - op = GF_FOP_READDIRP; - break; - } - } + stub = local->stub; + local->stub = NULL; - if (conf->use_readdirp) - op = GF_FOP_READDIRP; + parent_frame = stub->frame; + parent_local = parent_frame->local; -out: - dht_do_readdir (frame, this, fd, size, yoff, op); - return 0; + if (ret < 0) { + parent_local->op_ret = -1; + parent_local->op_errno = local->op_errno ? 
local->op_errno : EIO; + } else { + parent_local->op_ret = 0; + } + + call_resume(stub); + + DHT_STACK_DESTROY(frame); + + return 0; } -int -dht_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff) +static int +dht_refresh_parent_layout_done(call_frame_t *frame) { - dht_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP); - return 0; -} + dht_local_t *local = NULL; + int ret = 0; + local = frame->local; + if (local->op_ret < 0) { + ret = -1; + goto resume; + } -int -dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + dht_layout_set(frame->this, local->loc.inode, + local->selfheal.refreshed_layout); + +resume: + dht_refresh_parent_layout_resume(frame, frame->this, ret, 1); + return 0; +} + +static int +dht_handle_parent_layout_change(xlator_t *this, call_stub_t *stub) { - dht_local_t *local = NULL; - int this_call_cnt = 0; + call_frame_t *refresh_frame = NULL, *frame = NULL; + dht_local_t *refresh_local = NULL, *local = NULL; + frame = stub->frame; + local = frame->local; - local = frame->local; + refresh_frame = copy_frame(frame); + if (!refresh_frame) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "mem allocation failed for refresh_frame"); + return -1; + } - LOCK (&frame->lock); - { - if (op_ret == -1) - local->op_errno = op_errno; + refresh_local = dht_local_init(refresh_frame, NULL, NULL, stub->fop); + if (!refresh_local) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "mem allocation failed for refresh_local"); + return -1; + } - if (op_ret == 0) - local->op_ret = 0; - } - UNLOCK (&frame->lock); + refresh_local->loc.inode = inode_ref(local->loc.parent); + gf_uuid_copy(refresh_local->loc.gfid, local->loc.parent->gfid); - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (fsyncdir, frame, local->op_ret, local->op_errno); + refresh_local->stub = stub; - return 0; + 
refresh_local->refresh_layout_unlock = dht_refresh_parent_layout_resume; + refresh_local->refresh_layout_done = dht_refresh_parent_layout_done; + + dht_refresh_layout(refresh_frame); + return 0; } +static int32_t +dht_call_mkdir_stub(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_stub_t *stub = NULL; + + local = frame->local; + stub = local->stub; + local->stub = NULL; + + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; + } else { + local->op_ret = 0; + } + + call_resume(stub); + + return 0; +} + +static int32_t +dht_guard_parent_layout_and_namespace(xlator_t *subvol, call_stub_t *stub) +{ + dht_local_t *local = NULL; + int ret = -1; + loc_t *loc = NULL; + xlator_t *hashed_subvol = NULL, *this = NULL; + ; + call_frame_t *frame = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + int32_t *parent_disk_layout = NULL; + dht_layout_t *parent_layout = NULL; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO("dht", stub, err); + + frame = stub->frame; + this = frame->this; + + conf = this->private; + + local = frame->local; + + local->stub = stub; + + /* TODO: recheck whether we should lock on src or dst if we do similar + * stale layout checks for rename. 
+ */ + loc = &stub->args.loc; + + gf_uuid_unparse(loc->parent->gfid, pgfid); + + if (local->params == NULL) { + local->params = dict_new(); + if (local->params == NULL) { + local->op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "%s (%s/%s) (path: %s): " + "dict allocation failed", + gf_fop_list[stub->fop], pgfid, loc->name, loc->path); + goto err; + } + } + + hashed_subvol = dht_subvol_get_hashed(this, loc); + if (hashed_subvol == NULL) { + local->op_errno = EINVAL; + + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "%s (%s/%s) (path: %s): " + "hashed subvolume not found", + gf_fop_list[stub->fop], pgfid, loc->name, loc->path); + goto err; + } + + parent_layout = dht_layout_get(this, loc->parent); + + ret = dht_disk_layout_extract_for_subvol(this, parent_layout, hashed_subvol, + &parent_disk_layout); + if (ret == -1) { + local->op_errno = EINVAL; + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "%s (%s/%s) (path: %s): " + "extracting in-memory layout of parent failed. ", + gf_fop_list[stub->fop], pgfid, loc->name, loc->path); + goto err; + } + + memcpy((void *)local->parent_disk_layout, (void *)parent_disk_layout, + sizeof(local->parent_disk_layout)); + + dht_layout_unref(this, parent_layout); + parent_layout = NULL; + + ret = dict_set_str(local->params, GF_PREOP_PARENT_KEY, conf->xattr_name); + if (ret < 0) { + local->op_errno = -ret; + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "%s (%s/%s) (path: %s): " + "setting %s key in params dictionary failed. 
", + gf_fop_list[stub->fop], pgfid, loc->name, loc->path, + GF_PREOP_PARENT_KEY); + goto err; + } + + ret = dict_set_bin(local->params, conf->xattr_name, parent_disk_layout, + 4 * 4); + if (ret < 0) { + local->op_errno = -ret; + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "%s (%s/%s) (path: %s): " + "setting parent-layout in params dictionary failed. ", + gf_fop_list[stub->fop], pgfid, loc->name, loc->path); + goto err; + } + + parent_disk_layout = NULL; + local->hashed_subvol = hashed_subvol; + + local->current = &local->lock[0]; + ret = dht_protect_namespace(frame, loc, hashed_subvol, &local->current->ns, + dht_call_mkdir_stub); + if (ret < 0) + goto err; + + return 0; +err: + + if (parent_disk_layout != NULL) + GF_FREE(parent_disk_layout); + + if (parent_layout != NULL) + dht_layout_unref(this, parent_layout); + + return -1; +} int -dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) +dht_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *params) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int op_errno = -1; - int i = -1; + xlator_t *subvol = NULL; + int op_errno = -1; + int i = 0; + int ret = 0; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + conf = this->private; + + dht_get_du_info(frame, this, loc); + + local = dht_local_init(frame, loc, NULL, GF_FOP_MKNOD); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = dht_subvol_get_hashed(this, loc); + if (!subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + loc->path); + op_errno = EIO; + goto err; + } + + /* Post remove-brick, the client layout may not be in sync with + * disk layout because of lack of lookup. Hence,a mknod call + * may fall on the decommissioned brick. 
Hence, if the + * hashed_subvol is part of decommissioned bricks list, do a + * lookup on parent dir. If a fix-layout is already done by the + * remove-brick process, the parent directory layout will be in + * sync with that of the disk. If fix-layout is still ending + * on the parent directory, we can let the file get created on + * the decommissioned brick which will be eventually migrated to + * non-decommissioned brick based on the new layout. + */ + + if (conf->decommission_subvols_cnt) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i] && + conf->decommissioned_bricks[i] == subvol) { + gf_msg_debug(this->name, 0, + "hashed subvol:%s is " + "part of decommission brick list for " + "file: %s", + subvol->name, loc->path); + + /* dht_refresh_layout needs directory info in + * local->loc. Hence, storing the parent_loc in + * local->loc and storing the create context in + * local->loc2. We will restore this information + * in dht_creation do */ + + ret = loc_copy(&local->loc2, &local->loc); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "loc_copy failed %s", loc->path); + + goto err; + } + local->params = dict_ref(params); + local->rdev = rdev; + local->mode = mode; + local->umask = umask; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (this->private, err); + loc_wipe(&local->loc); - conf = this->private; + ret = dht_build_parent_loc(this, &local->loc, loc, &op_errno); - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_LOC_FAILED, + "parent loc build failed"); + goto err; + } - goto err; - } + ret = dht_mknod_lock(frame, subvol); - local->fd = fd_ref (fd); - local->call_cnt = conf->subvolume_cnt; + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR, + "locking parent failed"); + goto err; + } - for (i = 0; i < 
conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_fsyncdir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->fsyncdir, - fd, datasync); + goto done; + } } + } - return 0; + dht_mknod_wind_to_avail_subvol(frame, this, subvol, loc, rdev, mode, umask, + params); + +done: + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fsyncdir, frame, -1, op_errno); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } - int -dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) +dht_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *params) { - call_frame_t *prev = NULL; - int ret = -1; - dht_local_t *local = NULL; + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); - if (op_ret == -1) - goto out; + local = dht_local_init(frame, loc, NULL, GF_FOP_SYMLINK); + if (!local) { + op_errno = ENOMEM; + goto err; + } - local = frame->local; - if (!local) { - op_ret = -1; - op_errno = EINVAL; - goto out; - } + subvol = dht_subvol_get_hashed(this, loc); + if (!subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + loc->path); + op_errno = EIO; + goto err; + } - prev = cookie; + gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name); - dht_itransform (this, prev->this, stbuf->ia_ino, &stbuf->ia_ino); - if (local->loc.parent) { - preparent->ia_ino = local->loc.parent->ino; - postparent->ia_ino = local->loc.parent->ino; + STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->symlink, linkname, loc, umask, params); - WIPE (preparent); - WIPE (postparent); - } + return 0; - ret = 
dht_layout_preset (this, prev->this, inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not set pre-set layout for subvolume %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto out; - } -out: - /* - * FIXME: ia_size and st_blocks of preparent and postparent do not have - * correct values. since, preparent and postparent buffers correspond - * to a directory these two members should have values equal to sum of - * corresponding values from each of the subvolume. - * See dht_iatt_merge for reference. - */ +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, stbuf, preparent, - postparent); - return 0; + return 0; } int -dht_mknod_linkfile_create_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) +dht_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { - dht_local_t *local = NULL; - xlator_t *cached_subvol = NULL; - - if (op_ret == -1) - goto err; + xlator_t *cached_subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_UNLINK); + if (!local) { + op_errno = ENOMEM; + + goto err; + } + + cached_subvol = local->cached_subvol; + if (!cached_subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + local->flags = xflag; + STACK_WIND_COOKIE(frame, dht_unlink_cbk, cached_subvol, cached_subvol, + cached_subvol->fops->unlink, loc, xflag, xdata); + + return 0; +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL); - local = frame->local; - cached_subvol = local->cached_subvol; + return 0; +} - STACK_WIND (frame, dht_newfile_cbk, - cached_subvol, cached_subvol->fops->mknod, - &local->loc, local->mode, local->rdev, - local->params); +static int +dht_remove_stale_linkto_cbk(int ret, call_frame_t *sync_frame, void *data) +{ + DHT_STACK_DESTROY(sync_frame); + return 0; +} - return 0; -err: - DHT_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL); - return 0; +static int +dht_remove_stale_linkto(void *data) +{ + call_frame_t *frame = NULL; + dht_local_t *local = NULL; + xlator_t *this = NULL; + dict_t *xdata_in = NULL; + int ret = 0; + + GF_VALIDATE_OR_GOTO("dht", data, out); + + frame = data; + local = frame->local; + this = frame->this; + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", local, out); + GF_VALIDATE_OR_GOTO("dht", local->link_subvol, out); + + xdata_in = dict_new(); + if (!xdata_in) + goto out; + + ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(xdata_in); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, 0, + "Failed to set keys for stale linkto" + "deletion on path %s", + local->loc.path); + goto out; + } + + ret = syncop_unlink(local->link_subvol, &local->loc, xdata_in, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, 0, + "Removal of linkto failed" + " on path %s at subvol %s", + local->loc.path, local->link_subvol->name); + } +out: + if (xdata_in) + dict_unref(xdata_in); + return ret; } -int -dht_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *params) +static int +dht_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - int ret = -1; - xlator_t *avail_subvol = NULL; - dht_conf_t *conf = 
NULL; - dht_local_t *local = NULL; + dht_local_t *local = NULL; + int ret = -1; + gf_boolean_t stbuf_merged = _gf_false; + xlator_t *subvol = NULL; + call_frame_t *cleanup_frame = NULL; + dht_local_t *cleanup_local = NULL; + + local = frame->local; + + if (op_ret == -1) { + /* Remove the linkto if exists */ + if (local->linked) { + cleanup_frame = create_frame(this, this->ctx->pool); + if (cleanup_frame) { + cleanup_local = dht_local_init(cleanup_frame, &local->loc2, + NULL, 0); + if (!cleanup_local || !local->link_subvol) { + DHT_STACK_DESTROY(cleanup_frame); + goto out; + } + cleanup_local->link_subvol = local->link_subvol; + FRAME_SU_DO(cleanup_frame, dht_local_t); + ret = synctask_new(this->ctx->env, dht_remove_stale_linkto, + dht_remove_stale_linkto_cbk, cleanup_frame, + cleanup_frame); + } + } + /* No continuation on DHT inode missing errors, as we should + * then have a good stbuf that states P2 happened. We would + * get inode missing if, the file completed migrated between + * the lookup and the link call */ + goto out; + } + + /* Update parent on success, even if P1/2 checks are positive. 
+ * The second call on success will further update the parent */ + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0); + dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1); + } + + /* Update linkto attrs, if this is the first call and non-P2, + * if we detect P2 then we need to trust the attrs from the + * second call, not the first */ + if (local->linked == _gf_true && + ((local->call_cnt == 1 && !IS_DHT_MIGRATION_PHASE2(stbuf)) || + (local->call_cnt != 1 && IS_DHT_MIGRATION_PHASE2(&local->stbuf)))) { + dht_iatt_merge(this, &local->stbuf, stbuf); + stbuf_merged = _gf_true; + dht_linkfile_attr_heal(frame, this); + } + + /* No further P1/2 checks if we are in the second iteration of + * the call */ + if (local->call_cnt != 1) { + goto out; + } else { + /* Preserve the return values, in case the migration decides + * to recreate the link on the same subvol that the current + * hased for the link was created on. */ + dht_iatt_merge(this, &local->preparent, preparent); + dht_iatt_merge(this, &local->postparent, postparent); + if (!stbuf_merged) { + dht_iatt_merge(this, &local->stbuf, stbuf); + stbuf_merged = _gf_true; + } + + local->inode = inode_ref(inode); + } + + local->op_ret = op_ret; + local->op_errno = op_errno; + local->rebalance.target_op_fn = dht_link2; + dht_set_local_rebalance(this, local, stbuf, preparent, postparent, xdata); + + /* Check if the rebalance phase2 is true */ + if (IS_DHT_MIGRATION_PHASE2(stbuf)) { + ret = dht_inode_ctx_get_mig_info(this, local->loc.inode, NULL, &subvol); + if (!subvol) { + /* Phase 2 of migration */ + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } else { + dht_link2(this, subvol, frame, 0); + return 0; + } + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1(stbuf)) { + ret = dht_inode_ctx_get_mig_info(this, local->loc.inode, NULL, &subvol); + if (subvol) { + dht_link2(this, subvol, frame, 0); + return 0; + } + 
ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } +out: + DHT_STRIP_PHASE1_FLAGS(stbuf); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); + dht_set_fixed_dir_stat(preparent); + dht_set_fixed_dir_stat(postparent); + DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent, + postparent, NULL); - conf = this->private; + return 0; +} - dht_get_du_info (frame, this, loc); +static int +dht_link2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) +{ + dht_local_t *local = NULL; + int op_errno = EINVAL; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; + local = frame->local; + if (!local) + goto err; - goto err; - } + op_errno = local->op_errno; - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. 
+ */ + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = ENOMEM; + DHT_STACK_UNWIND(link, frame, local->op_ret, op_errno, local->inode, + &local->stbuf, &local->preparent, &local->postparent, + NULL); + return 0; + } - goto err; - } + if (subvol == NULL) { + op_errno = EINVAL; + goto err; + } - if (!dht_is_subvol_filled (this, subvol)) { - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); + /* Second call to create link file could result in EEXIST as the + * first call created the linkto in the currently + * migrating subvol, which could be the new hashed subvol */ + if (local->link_subvol == subvol) { + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); + DHT_STACK_UNWIND(link, frame, 0, 0, local->inode, &local->stbuf, + &local->preparent, &local->postparent, NULL); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, params); - } else { - avail_subvol = dht_free_disk_available_subvol (this, subvol); - if (avail_subvol != subvol) { - /* Choose the minimum filled volume, and create the - files there */ - - local->params = dict_ref (params); - local->cached_subvol = avail_subvol; - local->mode = mode; - local->rdev = rdev; - - dht_linkfile_create (frame, - dht_mknod_linkfile_create_cbk, - avail_subvol, subvol, loc); - } else { - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); + return 0; + } - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, params); - } - } + local->call_cnt = 2; - return 0; + STACK_WIND(frame, dht_link_cbk, subvol, subvol->fops->link, &local->loc, + &local->loc2, local->xattr_req); + return 0; err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } - -int -dht_symlink (call_frame_t *frame, xlator_t *this, - const char *linkname, loc_t *loc, dict_t *params) +static int +dht_link_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - int ret = -1; - + dht_local_t *local = NULL; + xlator_t *srcvol = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); + if (op_ret == -1) + goto err; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - - goto err; - } + local = frame->local; + srcvol = local->linkfile.srcvol; - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } + STACK_WIND(frame, dht_link_cbk, srcvol, srcvol->fops->link, &local->loc, + &local->loc2, local->xattr_req); - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - gf_log (this->name, GF_LOG_TRACE, "Failed to copy loc"); - op_errno = ENOMEM; - goto err; - } + return 0; - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); +err: + DHT_STRIP_PHASE1_FLAGS(stbuf); + dht_set_fixed_dir_stat(preparent); + dht_set_fixed_dir_stat(postparent); + DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent, + postparent, xdata); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->symlink, - linkname, loc, params); + return 0; +} - return 0; +int +dht_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + xlator_t *cached_subvol = NULL; + xlator_t *hashed_subvol 
= NULL; + int op_errno = -1; + int ret = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(oldloc, err); + VALIDATE_OR_GOTO(newloc, err); + + local = dht_local_init(frame, oldloc, NULL, GF_FOP_LINK); + if (!local) { + op_errno = ENOMEM; + + goto err; + } + local->call_cnt = 1; + + cached_subvol = local->cached_subvol; + if (!cached_subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for path=%s", + oldloc->path); + op_errno = ENOENT; + goto err; + } + + hashed_subvol = dht_subvol_get_hashed(this, newloc); + if (!hashed_subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + newloc->path); + op_errno = EIO; + goto err; + } + + ret = loc_copy(&local->loc2, newloc); + if (ret == -1) { + op_errno = ENOMEM; + goto err; + } + if (xdata) + local->xattr_req = dict_ref(xdata); + + if (hashed_subvol != cached_subvol) { + gf_uuid_copy(local->gfid, oldloc->inode->gfid); + dht_linkfile_create(frame, dht_link_linkfile_cbk, this, cached_subvol, + hashed_subvol, newloc); + } else { + STACK_WIND(frame, dht_link_cbk, cached_subvol, + cached_subvol->fops->link, oldloc, newloc, xdata); + } + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (link, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } - int -dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) -{ - xlator_t *cached_subvol = NULL; - xlator_t *hashed_subvol = NULL; - int ret = -1; - int op_errno = -1; - dht_local_t *local = NULL; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - - if (dht_filter_loc_subvol_key (this, loc, &local->loc, - &cached_subvol)) { - gf_log (this->name, GF_LOG_NORMAL, - "unlinking %s on %s (given path %s)", - local->loc.path, cached_subvol->name, loc->path); - STACK_WIND (frame, dht_unlink_cbk, - cached_subvol, cached_subvol->fops->unlink, - &local->loc); - goto done; - } - - cached_subvol = dht_subvol_get_cached (this, loc->inode); - if (!cached_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } - - hashed_subvol = dht_subvol_get_hashed (this, loc); - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = EINVAL; - goto err; - } +dht_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + xlator_t *prev = NULL; + int ret = -1; + dht_local_t *local = NULL; + gf_boolean_t parent_layout_changed = _gf_false; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + xlator_t *subvol = NULL; + + local = frame->local; + + local = frame->local; + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + if (op_ret == -1) { + local->op_errno = op_errno; + parent_layout_changed = (xdata && + dict_get(xdata, GF_PREOP_CHECK_FAILED)) + ? 
_gf_true + : _gf_false; + + if (parent_layout_changed) { + if (local && local->lock[0].layout.parent_layout.locks) { + /* Returning failure as the layout could not be fixed even under + * the lock */ + goto out; + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; + gf_uuid_unparse(local->loc.parent->gfid, pgfid); + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_PARENT_LAYOUT_CHANGED, + "create (%s/%s) (path: %s): parent layout " + "changed. Attempting a layout refresh and then a " + "retry", + pgfid, local->loc.name, local->loc.path); - goto err; - } + /* + dht_refresh_layout needs directory info in local->loc.Hence, + storing the parent_loc in local->loc and storing the create + context in local->loc2. We will restore this information in + dht_creation_do. + */ - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = ENOMEM; + loc_wipe(&local->loc2); - goto err; - } + ret = loc_copy(&local->loc2, &local->loc); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "loc_copy failed %s", local->loc.path); - if (hashed_subvol != cached_subvol) { - STACK_WIND (frame, dht_unlink_linkfile_cbk, - hashed_subvol, hashed_subvol->fops->unlink, loc); - } else { - STACK_WIND (frame, dht_unlink_cbk, - cached_subvol, cached_subvol->fops->unlink, loc); - } -done: - return 0; -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL); + goto out; + } - return 0; -} + loc_wipe(&local->loc); + ret = dht_build_parent_loc(this, &local->loc, &local->loc2, + &op_errno); -int -dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) -{ - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - dht_local_t *local = NULL; + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_LOC_FAILED, + "parent loc build failed"); + goto out; + } - prev = cookie; - local = frame->local; + subvol = dht_subvol_get_hashed(this, &local->loc2); - if (op_ret == -1) + ret = dht_create_lock(frame, subvol); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR, + "locking parent failed"); goto out; + } - layout = dht_layout_for_subvol (this, prev->this); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no pre-set layout for subvolume %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto out; + return 0; } - stbuf->ia_ino = local->loc.inode->ino; + goto out; + } + + prev = cookie; + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0); + + dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1); + } - preparent->ia_ino = local->loc2.parent->ino; - postparent->ia_ino = local->loc2.parent->ino; + ret = dht_fd_ctx_set(this, fd, prev); + if (ret != 0) { + gf_msg_debug(this->name, 0, + "Possible fd leak. 
" + "Could not set fd ctx for subvol %s", + prev->name); + } - WIPE (preparent); - WIPE (postparent); + ret = dht_layout_preset(this, prev, inode); + if (ret != 0) { + gf_msg_debug(this->name, 0, "could not set preset layout for subvol %s", + prev->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + local->op_errno = op_errno; + + if (local->linked == _gf_true) { + local->stbuf = *stbuf; + dht_linkfile_attr_heal(frame, this); + } out: - DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent, - postparent); - return 0; + DHT_STRIP_PHASE1_FLAGS(stbuf); + dht_set_fixed_dir_stat(preparent); + dht_set_fixed_dir_stat(postparent); + + if (local && local->lock[0].layout.parent_layout.locks) { + /* store op_errno for failure case*/ + local->op_errno = op_errno; + local->refresh_layout_unlock(frame, this, op_ret, 1); + + if (op_ret == 0) { + DHT_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf, + preparent, postparent, xdata); + } + } else { + DHT_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf, + preparent, postparent, xdata); + } + return 0; } +static int +dht_create_linkfile_create_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *cached_subvol = NULL; + dht_conf_t *conf = NULL; + + local = frame->local; + if (!local) { + op_errno = EINVAL; + goto err; + } + + if (op_ret == -1) { + local->op_errno = op_errno; + goto err; + } + + conf = this->private; + if (!conf) { + local->op_errno = EINVAL; + op_errno = EINVAL; + goto err; + } + + cached_subvol = local->cached_subvol; + + if (local->params) { + dict_del(local->params, conf->link_xattr_name); + dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY); + } + + STACK_WIND_COOKIE(frame, dht_create_cbk, cached_subvol, cached_subvol, + cached_subvol->fops->create, &local->loc, 
local->flags, + local->mode, local->umask, local->fd, local->params); + + return 0; +err: + if (local && local->lock[0].layout.parent_layout.locks) { + local->refresh_layout_unlock(frame, this, -1, 1); + } else { + DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); + } + return 0; +} -int -dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) +static int +dht_create_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this, + xlator_t *subvol, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, + dict_t *params) { - dht_local_t *local = NULL; - xlator_t *srcvol = NULL; + dht_local_t *local = NULL; + xlator_t *avail_subvol = NULL; + local = frame->local; - if (op_ret == -1) - goto err; + if (!dht_is_subvol_filled(this, subvol)) { + gf_msg_debug(this->name, 0, "creating %s on %s", loc->path, + subvol->name); - local = frame->local; - srcvol = local->linkfile.srcvol; + dht_set_parent_layout_in_dict(loc, this, local); - STACK_WIND (frame, dht_link_cbk, - srcvol, srcvol->fops->link, - &local->loc, &local->loc2); + STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol, + subvol->fops->create, loc, flags, mode, umask, fd, + params); - return 0; + } else { + avail_subvol = dht_free_disk_available_subvol(this, subvol, local); -err: - DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent, - postparent); + if (avail_subvol != subvol) { + local->cached_subvol = avail_subvol; + local->hashed_subvol = subvol; - return 0; -} + gf_msg_debug(this->name, 0, "creating %s on %s (link at %s)", + loc->path, avail_subvol->name, subvol->name); + + dht_linkfile_create(frame, dht_create_linkfile_create_cbk, this, + avail_subvol, subvol, loc); + + goto out; + } + + gf_msg_debug(this->name, 0, "creating %s on %s", loc->path, + subvol->name); + dht_set_parent_layout_in_dict(loc, this, local); 
+ + STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol, + subvol->fops->create, loc, flags, mode, umask, fd, + params); + } +out: + return 0; +} int -dht_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) +dht_build_parent_loc(xlator_t *this, loc_t *parent, loc_t *child, + int32_t *op_errno) { - xlator_t *cached_subvol = NULL; - xlator_t *hashed_subvol = NULL; - int op_errno = -1; - int ret = -1; - dht_local_t *local = NULL; + inode_table_t *table = NULL; + int ret = -1; + if (!parent || !child) { + if (op_errno) + *op_errno = EINVAL; + goto out; + } - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (oldloc, err); - VALIDATE_OR_GOTO (newloc, err); - - cached_subvol = dht_subvol_get_cached (this, oldloc->inode); - if (!cached_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", oldloc->path); - op_errno = EINVAL; - goto err; + if (child->parent) { + parent->inode = inode_ref(child->parent); + if (!parent->inode) { + if (op_errno) + *op_errno = EINVAL; + goto out; } - hashed_subvol = dht_subvol_get_hashed (this, newloc); - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - newloc->path); - op_errno = EINVAL; - goto err; - } + gf_uuid_copy(parent->gfid, child->pargfid); - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; + ret = 0; - goto err; + goto out; + } else { + if (gf_uuid_is_null(child->pargfid)) { + if (op_errno) + *op_errno = EINVAL; + goto out; } - ret = loc_copy (&local->loc, oldloc); - if (ret == -1) { - op_errno = ENOMEM; + table = this->itable; - goto err; + if (!table) { + if (op_errno) { + *op_errno = EINVAL; + goto out; + } } - ret = loc_copy (&local->loc2, newloc); - if (ret == -1) { - op_errno = ENOMEM; + parent->inode = inode_find(table, child->pargfid); - goto err; + if (!parent->inode) { + if (op_errno) { + *op_errno = EINVAL; + goto out; + } } - if (hashed_subvol != cached_subvol) { - 
memcpy (local->gfid, oldloc->inode->gfid, 16); - dht_linkfile_create (frame, dht_link_linkfile_cbk, - cached_subvol, hashed_subvol, newloc); - } else { - STACK_WIND (frame, dht_link_cbk, - cached_subvol, cached_subvol->fops->link, - oldloc, newloc); - } + gf_uuid_copy(parent->gfid, child->pargfid); - return 0; + ret = 0; + } + +out: + return ret; +} + +static int32_t +dht_create_do(call_frame_t *frame) +{ + dht_local_t *local = NULL; + dht_layout_t *refreshed = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; + + local = frame->local; + + this = THIS; + + conf = this->private; + + GF_VALIDATE_OR_GOTO(this->name, conf, err); + + methods = &(conf->methods); + + /* We don't need parent_loc anymore */ + loc_wipe(&local->loc); + + loc_copy(&local->loc, &local->loc2); + + loc_wipe(&local->loc2); + + refreshed = local->selfheal.refreshed_layout; + subvol = methods->layout_search(this, refreshed, local->loc.name); + + if (!subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "no subvolume in " + "layout for path=%s", + local->loc.path); + local->op_errno = ENOENT; + goto err; + } + + dht_create_wind_to_avail_subvol(frame, this, subvol, &local->loc, + local->flags, local->mode, local->umask, + local->fd, local->params); + return 0; err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL); + local->refresh_layout_unlock(frame, this, -1, 1); - return 0; + return 0; } +static int32_t +dht_create_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + DHT_STACK_DESTROY(frame); + return 0; +} -int -dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - fd_t *fd, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) +static int32_t +dht_create_finish(call_frame_t *frame, xlator_t *this, int op_ret, + int invoke_cbk) { - call_frame_t *prev = NULL; - int ret = -1; - dht_local_t *local = NULL; + dht_local_t *local = NULL, *lock_local = NULL; + call_frame_t *lock_frame = NULL; + int lock_count = 0; + + local = frame->local; + lock_count = dht_lock_count(local->lock[0].layout.parent_layout.locks, + local->lock[0].layout.parent_layout.lk_count); + if (lock_count == 0) + goto done; + + lock_frame = copy_frame(frame); + if (lock_frame == NULL) { + goto done; + } + + lock_local = dht_local_init(lock_frame, &local->loc, NULL, + lock_frame->root->op); + if (lock_local == NULL) { + goto done; + } + + lock_local->lock[0] + .layout.parent_layout.locks = local->lock[0].layout.parent_layout.locks; + lock_local->lock[0].layout.parent_layout.lk_count = + local->lock[0].layout.parent_layout.lk_count; + + local->lock[0].layout.parent_layout.locks = NULL; + local->lock[0].layout.parent_layout.lk_count = 0; + + dht_unlock_inodelk(lock_frame, + lock_local->lock[0].layout.parent_layout.locks, + lock_local->lock[0].layout.parent_layout.lk_count, + dht_create_unlock_cbk); + lock_frame = NULL; - if (op_ret == -1) - goto out; +done: + if (lock_frame != NULL) { + DHT_STACK_DESTROY(lock_frame); + } - local = frame->local; - if (!local) { - op_ret = -1; - op_errno = EINVAL; - goto out; - } + if (op_ret == 0) + return 0; - prev = cookie; + DHT_STACK_UNWIND(create, 
frame, op_ret, local->op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); + return 0; +} - dht_itransform (this, prev->this, stbuf->ia_ino, &stbuf->ia_ino); - if (local->loc.parent) { - preparent->ia_ino = local->loc.parent->ino; - postparent->ia_ino = local->loc.parent->ino; +static int32_t +dht_create_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; - WIPE (preparent); - WIPE (postparent); - } + local = frame->local; - ret = dht_layout_preset (this, prev->this, inode); - if (ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not set preset layout for subvol %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto out; - } + if (!local) { + goto err; + } -out: - DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, stbuf, preparent, - postparent); - return 0; -} + if (op_ret < 0) { + gf_msg("DHT", GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR, + "Create lock failed for file: %s", local->loc2.name); + local->op_errno = op_errno; -int -dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) -{ - dht_local_t *local = NULL; - xlator_t *cached_subvol = NULL; + goto err; + } - if (op_ret == -1) - goto err; + local->refresh_layout_unlock = dht_create_finish; - local = frame->local; - cached_subvol = local->cached_subvol; + local->refresh_layout_done = dht_create_do; - STACK_WIND (frame, dht_create_cbk, - cached_subvol, cached_subvol->fops->create, - &local->loc, local->flags, local->mode, - local->fd, local->params); + dht_refresh_layout(frame); - return 0; + return 0; err: - DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - return 0; + if (local) + dht_create_finish(frame, this, -1, 0); + else + DHT_STACK_UNWIND(create, frame, -1, EINVAL, NULL, NULL, NULL, NULL, + NULL, NULL); + 
return 0; } -int -dht_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) +int32_t +dht_create_lock(call_frame_t *frame, xlator_t *subvol) { - int op_errno = -1; - int ret = -1; - xlator_t *subvol = NULL; - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - xlator_t *avail_subvol = NULL; + dht_local_t *local = NULL; + int count = 1, ret = -1; + dht_lock_t **lk_array = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err); - conf = this->private; + local = frame->local; - dht_get_du_info (frame, this, loc); + lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer); - local = dht_local_init (frame); - if (!local) { + if (lk_array == NULL) + goto err; - op_errno = ENOMEM; - goto err; - } + lk_array[0] = dht_lock_new(frame->this, subvol, &local->loc, F_RDLCK, + DHT_LAYOUT_HEAL_DOMAIN, NULL, + IGNORE_ENOENT_ESTALE); - if (dht_filter_loc_subvol_key (this, loc, &local->loc, - &subvol)) { - gf_log (this->name, GF_LOG_NORMAL, - "creating %s on %s (got create on %s)", - local->loc.path, subvol->name, loc->path); - STACK_WIND (frame, dht_create_cbk, - subvol, subvol->fops->create, - &local->loc, flags, mode, fd, params); - goto done; - } + if (lk_array[0] == NULL) + goto err; - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = ENOMEM; + local->lock[0].layout.parent_layout.locks = lk_array; + local->lock[0].layout.parent_layout.lk_count = count; - goto err; - } - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } + ret = dht_blocking_inodelk(frame, lk_array, count, dht_create_lock_cbk); - if (!dht_is_subvol_filled (this, subvol)) { - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, 
subvol->name); - STACK_WIND (frame, dht_create_cbk, - subvol, subvol->fops->create, - loc, flags, mode, fd, params); - goto done; - } - /* Choose the minimum filled volume, and create the - files there */ - /* TODO */ - avail_subvol = dht_free_disk_available_subvol (this, subvol); - if (avail_subvol != subvol) { - local->fd = fd_ref (fd); - local->params = dict_ref (params); - local->flags = flags; - local->mode = mode; - - local->cached_subvol = avail_subvol; - local->hashed_subvol = subvol; - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s (link at %s)", loc->path, - avail_subvol->name, subvol->name); - dht_linkfile_create (frame, - dht_create_linkfile_create_cbk, - avail_subvol, subvol, loc); - goto done; - } - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_create_cbk, - subvol, subvol->fops->create, - loc, flags, mode, fd, params); -done: - return 0; + if (ret < 0) { + local->lock[0].layout.parent_layout.locks = NULL; + local->lock[0].layout.parent_layout.lk_count = 0; + goto err; + } + return 0; err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + if (lk_array != NULL) { + dht_lock_array_free(lk_array, count); + GF_FREE(lk_array); + } - return 0; + return -1; } - int -dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno) +dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local) { - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; + dht_conf_t *conf = this->private; + dht_layout_t *parent_layout = NULL; + int *parent_disk_layout = NULL; + xlator_t *hashed_subvol = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + int ret = 0; + + gf_uuid_unparse(loc->parent->gfid, pgfid); + + parent_layout = dht_layout_get(this, loc->parent); + hashed_subvol = dht_subvol_get_hashed(this, loc); + + ret = dht_disk_layout_extract_for_subvol(this, parent_layout, hashed_subvol, + &parent_disk_layout); + if (ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "%s (%s/%s) (path: %s): " + "extracting in-memory layout of parent failed. ", + gf_fop_list[local->fop], pgfid, loc->name, loc->path); + goto err; + } + + ret = dict_set_str_sizen(local->params, GF_PREOP_PARENT_KEY, + conf->xattr_name); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "%s (%s/%s) (path: %s): " + "setting %s key in params dictionary failed. ", + gf_fop_list[local->fop], pgfid, loc->name, loc->path, + GF_PREOP_PARENT_KEY); + goto err; + } + + ret = dict_set_bin(local->params, conf->xattr_name, parent_disk_layout, + 4 * 4); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "%s (%s/%s) (path: %s): " + "setting parent-layout in params dictionary failed. 
", + gf_fop_list[local->fop], pgfid, loc->name, loc->path); + goto err; + } +err: + dht_layout_unref(this, parent_layout); + return ret; +} - local = frame->local; - layout = local->selfheal.layout; +int +dht_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *params) +{ + int op_errno = -1; + xlator_t *subvol = NULL; + xlator_t *hashed_subvol = NULL; + dht_local_t *local = NULL; + int i = 0; + dht_conf_t *conf = NULL; + int ret = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + conf = this->private; + + dht_get_du_info(frame, this, loc); + + local = dht_local_init(frame, loc, fd, GF_FOP_CREATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->params = dict_ref(params); + local->flags = flags; + local->mode = mode; + local->umask = umask; + + if (dht_filter_loc_subvol_key(this, loc, &local->loc, &subvol)) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "creating %s on %s (got create on %s)", local->loc.path, + subvol->name, loc->path); + + /* Since lookup-optimize is enabled by default, we need + * to create the linkto file if required. + * Note this does not check for decommisioned bricks + * and min-free-disk limits as this is a debugging tool + * and not expected to be used in production. 
+ */ + hashed_subvol = dht_subvol_get_hashed(this, &local->loc); - if (op_ret == 0) { - dht_layout_set (this, local->inode, layout); - local->stbuf.ia_ino = local->ia_ino; - if (local->loc.parent) { - local->preparent.ia_ino = local->loc.parent->ino; - local->postparent.ia_ino = local->loc.parent->ino; - - WIPE (&local->preparent); - WIPE (&local->postparent); - } - } + if (hashed_subvol && (hashed_subvol != subvol)) { + /* Create the linkto file and then the data file */ + local->cached_subvol = subvol; + local->hashed_subvol = hashed_subvol; - DHT_STACK_UNWIND (mkdir, frame, op_ret, op_errno, - local->inode, &local->stbuf, &local->preparent, - &local->postparent); + dht_linkfile_create(frame, dht_create_linkfile_create_cbk, this, + subvol, hashed_subvol, &local->loc); + goto done; + } + /* We either don't have a hashed subvol or the hashed subvol is + * the same as the one specified. No need to create the linkto + * file as we expect a lookup everywhere if there are problems + * with the parent layout + */ - return 0; -} + dht_set_parent_layout_in_dict(loc, this, local); + + STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol, + subvol->fops->create, &local->loc, flags, mode, umask, + fd, params); + goto done; + } + + subvol = dht_subvol_get_hashed(this, loc); + if (!subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "no subvolume in layout for path=%s", loc->path); + + op_errno = EIO; + goto err; + } + + /* Post remove-brick, the client layout may not be in sync with + * disk layout because of lack of lookup. Hence,a create call + * may fall on the decommissioned brick. Hence, if the + * hashed_subvol is part of decommissioned bricks list, do a + * lookup on parent dir. If a fix-layout is already done by the + * remove-brick process, the parent directory layout will be in + * sync with that of the disk. 
If fix-layout is still ending + * on the parent directory, we can let the file get created on + * the decommissioned brick which will be eventually migrated to + * non-decommissioned brick based on the new layout. + */ + + if (conf->decommission_subvols_cnt) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i] && + conf->decommissioned_bricks[i] == subvol) { + gf_msg_debug(this->name, 0, + "hashed subvol:%s is " + "part of decommission brick list for " + "file: %s", + subvol->name, loc->path); + + /* dht_refresh_layout needs directory info in + * local->loc. Hence, storing the parent_loc in + * local->loc and storing the create context in + * local->loc2. We will restore this information + * in dht_creation do */ + + ret = loc_copy(&local->loc2, &local->loc); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "loc_copy failed %s", loc->path); + + goto err; + } -int -dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - int ret = -1; - int subvol_filled = 0; - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - dht_conf_t *conf = NULL; - - conf = this->private; - local = frame->local; - prev = cookie; - layout = local->layout; + loc_wipe(&local->loc); - subvol_filled = dht_is_subvol_filled (this, prev->this); + ret = dht_build_parent_loc(this, &local->loc, loc, &op_errno); - LOCK (&frame->lock); - { - if (subvol_filled && (op_ret != -1)) { - ret = dht_layout_merge (this, layout, prev->this, - -1, ENOSPC, NULL); - } else { - ret = dht_layout_merge (this, layout, prev->this, - op_ret, op_errno, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_LOC_FAILED, + "parent loc build failed"); + goto err; } - if (op_ret == -1) { - local->op_errno = op_errno; - goto unlock; - } - dht_iatt_merge (this, 
&local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->preparent, preparent, prev->this); - dht_iatt_merge (this, &local->postparent, postparent, - prev->this); + ret = dht_create_lock(frame, subvol); - if (prev->this == dht_first_up_subvol (this)) { - local->ia_ino = local->stbuf.ia_ino; + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR, + "locking parent failed"); + goto err; } + goto done; + } } -unlock: - UNLOCK (&frame->lock); + } - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - dht_selfheal_new_directory (frame, dht_mkdir_selfheal_cbk, - layout); - } + dht_create_wind_to_avail_subvol(frame, this, subvol, loc, flags, mode, + umask, fd, params); +done: + return 0; - return 0; +err: + + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); + + return 0; } -int -dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) +static int +dht_mkdir_selfheal_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - int ret = -1; - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - dht_conf_t *conf = NULL; - int i = 0; - xlator_t *hashed_subvol = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; - VALIDATE_OR_GOTO (this->private, err); + local = frame->local; + layout = local->selfheal.layout; - local = frame->local; - prev = cookie; - layout = local->layout; - conf = this->private; - hashed_subvol = local->hashed_subvol; + FRAME_SU_UNDO(frame, dht_local_t); + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); - if (uuid_is_null (local->loc.inode->gfid) && !op_ret) - memcpy (local->loc.inode->gfid, stbuf->ia_gfid, 16); + if (op_ret == 0) { + 
dht_layout_set(this, local->inode, layout); - if (dht_is_subvol_filled (this, hashed_subvol)) - ret = dht_layout_merge (this, layout, prev->this, - -1, ENOSPC, NULL); - else - ret = dht_layout_merge (this, layout, prev->this, - op_ret, op_errno, NULL); + dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1); + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->preparent, 0); - if (op_ret == -1) { - local->op_errno = op_errno; - goto err; + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); } - local->op_ret = 0; - - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->preparent, preparent, prev->this); - dht_iatt_merge (this, &local->postparent, postparent, prev->this); + } - local->ia_ino = local->stbuf.ia_ino; + DHT_STACK_UNWIND(mkdir, frame, op_ret, op_errno, local->inode, + &local->stbuf, &local->preparent, &local->postparent, + NULL); - local->call_cnt = conf->subvolume_cnt - 1; + return 0; +} - if (local->call_cnt == 0) { - dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk, - &local->loc, layout); +static int +dht_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + int ret = -1; + gf_boolean_t subvol_filled = _gf_false; + gf_boolean_t dir_exists = _gf_false; + xlator_t *prev = NULL; + dht_layout_t *layout = NULL; + + local = frame->local; + prev = cookie; + layout = local->layout; + + subvol_filled = dht_is_subvol_filled(this, prev); + + LOCK(&frame->lock); + { + if (subvol_filled && (op_ret != -1)) { + ret = dht_layout_merge(this, layout, prev, -1, ENOSPC, NULL); + } else { + if (op_ret == -1 && op_errno == EEXIST) { + /* Very likely just a race between mkdir and + self-heal (from lookup of a concurrent mkdir + attempt). + Ignore error for now. 
layout setting will + anyways fail if this was a different (old) + pre-existing different directory. + */ + op_ret = 0; + dir_exists = _gf_true; + } + ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, NULL); } - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvolumes[i] == hashed_subvol) - continue; - STACK_WIND (frame, dht_mkdir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->mkdir, - &local->loc, local->mode, local->params); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED, + "%s: failed to merge layouts for subvol %s", local->loc.path, + prev->name); + + if (op_ret == -1) { + local->op_errno = op_errno; + goto unlock; } - return 0; -err: - DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL); - return 0; -} + if (dir_exists) + goto unlock; -int -dht_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params) -{ - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int op_errno = -1; - int ret = -1; - xlator_t *hashed_subvol = NULL; + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->preparent, preparent); + dht_iatt_merge(this, &local->postparent, postparent); + } +unlock: + UNLOCK(&frame->lock); + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + /*Unlock entrylk and inodelk once mkdir is done on all subvols*/ + dht_unlock_namespace(frame, &local->lock[0]); + FRAME_SU_DO(frame, dht_local_t); + dht_selfheal_new_directory(frame, dht_mkdir_selfheal_cbk, layout); + } - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (this->private, err); + return 0; +} - conf = this->private; +static int +dht_mkdir_hashed_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt 
*postparent, dict_t *xdata); - dht_get_du_info (frame, this, loc); +static int +dht_mkdir_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *params) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1, ret = -1; + xlator_t *hashed_subvol = NULL; + int32_t *parent_disk_layout = NULL; + dht_layout_t *parent_layout = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + VALIDATE_OR_GOTO(this->private, err); + + gf_uuid_unparse(loc->parent->gfid, pgfid); + + conf = this->private; + local = frame->local; + + if (local->op_ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "mkdir (%s/%s) (path: %s): refreshing parent layout " + "failed.", + pgfid, loc->name, loc->path); + + op_errno = local->op_errno; + goto err; + } + + local->op_ret = -1; + + hashed_subvol = dht_subvol_get_hashed(this, loc); + if (hashed_subvol == NULL) { + gf_msg_debug(this->name, 0, + "mkdir (%s/%s) (path: %s): hashed subvol not " + "found", + pgfid, loc->name, loc->path); + op_errno = ENOENT; + goto err; + } + + local->hashed_subvol = hashed_subvol; + + parent_layout = dht_layout_get(this, loc->parent); + + ret = dht_disk_layout_extract_for_subvol(this, parent_layout, hashed_subvol, + &parent_disk_layout); + if (ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, EIO, DHT_MSG_PARENT_LAYOUT_CHANGED, + "mkdir (%s/%s) (path: %s): " + "extracting in-memory layout of parent failed. ", + pgfid, loc->name, loc->path); + goto err; + } + + if (memcmp(local->parent_disk_layout, parent_disk_layout, + sizeof(local->parent_disk_layout)) == 0) { + gf_msg(this->name, GF_LOG_WARNING, EIO, DHT_MSG_PARENT_LAYOUT_CHANGED, + "mkdir (%s/%s) (path: %s): loop detected. 
" + "parent layout didn't change even though " + "previous attempt of mkdir failed because of " + "in-memory layout not matching with that on disk.", + pgfid, loc->name, loc->path); + op_errno = EIO; + goto err; + } + + memcpy((void *)local->parent_disk_layout, (void *)parent_disk_layout, + sizeof(local->parent_disk_layout)); + + dht_layout_unref(this, parent_layout); + parent_layout = NULL; + + ret = dict_set_str(params, GF_PREOP_PARENT_KEY, conf->xattr_name); + if (ret < 0) { + local->op_errno = -ret; + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "mkdir (%s/%s) (path: %s): " + "setting %s key in params dictionary failed. ", + pgfid, loc->name, loc->path, GF_PREOP_PARENT_KEY); + goto err; + } + + ret = dict_set_bin(params, conf->xattr_name, parent_disk_layout, 4 * 4); + if (ret < 0) { + local->op_errno = -ret; + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "setting parent-layout in params dictionary failed. " + "mkdir (%s/%s) (path: %s)", + pgfid, loc->name, loc->path); + goto err; + } + + parent_disk_layout = NULL; + + STACK_WIND_COOKIE(frame, dht_mkdir_hashed_cbk, hashed_subvol, hashed_subvol, + hashed_subvol->fops->mkdir, loc, mode, umask, params); + + return 0; - local = dht_local_init (frame); - if (!local) { +err: + dht_unlock_namespace(frame, &local->lock[0]); - op_errno = ENOMEM; - goto err; - } + op_errno = local ? 
local->op_errno : op_errno; + DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - hashed_subvol = dht_subvol_get_hashed (this, loc); + if (parent_disk_layout != NULL) + GF_FREE(parent_disk_layout); - if (hashed_subvol == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "hashed subvol not found for %s", - loc->path); - op_errno = EINVAL; - goto err; - } + if (parent_layout != NULL) + dht_layout_unref(this, parent_layout); - local->hashed_subvol = hashed_subvol; - local->inode = inode_ref (loc->inode); - ret = loc_copy (&local->loc, loc); - local->mode = mode; + return 0; +} - if (ret == -1) { +static int +dht_mkdir_hashed_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + int ret = -1; + xlator_t *prev = NULL; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *hashed_subvol = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + gf_boolean_t parent_layout_changed = _gf_false; + call_stub_t *stub = NULL; + + local = frame->local; + prev = cookie; + layout = local->layout; + conf = this->private; + hashed_subvol = local->hashed_subvol; + + gf_uuid_unparse(local->loc.parent->gfid, pgfid); + + if (gf_uuid_is_null(local->loc.gfid) && !op_ret) + gf_uuid_copy(local->loc.gfid, stbuf->ia_gfid); + + if (op_ret == -1) { + local->op_errno = op_errno; + + parent_layout_changed = (xdata && + dict_get(xdata, GF_PREOP_CHECK_FAILED)) + ? 1 + : 0; + if (parent_layout_changed) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_PARENT_LAYOUT_CHANGED, + "mkdir (%s/%s) (path: %s): parent layout " + "changed. 
Attempting a refresh and then a " + "retry", + pgfid, local->loc.name, local->loc.path); + + stub = fop_mkdir_stub(frame, dht_mkdir_helper, &local->loc, + local->mode, local->umask, local->params); + if (stub == NULL) { + goto err; + } - op_errno = ENOMEM; + ret = dht_handle_parent_layout_change(this, stub); + if (ret) { goto err; - } + } + + stub = NULL; + + return 0; + } + + goto err; + } + + dict_del(local->params, GF_PREOP_PARENT_KEY); + dict_del(local->params, conf->xattr_name); + + if (dht_is_subvol_filled(this, hashed_subvol)) + ret = dht_layout_merge(this, layout, prev, -1, ENOSPC, NULL); + else + ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, NULL); + + /* TODO: we may have to return from the function + if layout merge fails. For now, lets just log an error */ + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED, + "%s: failed to merge layouts for subvol %s", local->loc.path, + prev->name); + + local->op_ret = 0; + + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->preparent, preparent); + dht_iatt_merge(this, &local->postparent, postparent); + + local->call_cnt = conf->subvolume_cnt - 1; + /* Delete internal mds xattr from params dict to avoid store + internal mds xattr on other subvols + */ + dict_del(local->params, conf->mds_xattr_key); + + if (gf_uuid_is_null(local->loc.gfid)) + gf_uuid_copy(local->loc.gfid, stbuf->ia_gfid); + + /* Set hashed subvol as a mds subvol on inode ctx */ + /*if (!local->inode) + local->inode = inode_ref (inode); + */ + ret = dht_inode_ctx_mdsvol_set(local->inode, this, hashed_subvol); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "Failed to set hashed subvol for %s on inode vol is %s", + local->loc.path, hashed_subvol->name); + } + + if (local->call_cnt == 0) { + /*Unlock namespace lock once mkdir is done on all subvols*/ + dht_unlock_namespace(frame, &local->lock[0]); + FRAME_SU_DO(frame, dht_local_t); + 
dht_selfheal_directory(frame, dht_mkdir_selfheal_cbk, &local->loc, + layout); + return 0; + } + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == hashed_subvol) + continue; + STACK_WIND_COOKIE(frame, dht_mkdir_cbk, conf->subvolumes[i], + conf->subvolumes[i], conf->subvolumes[i]->fops->mkdir, + &local->loc, local->mode, local->umask, + local->params); + } + + return 0; +err: + if (local->op_ret != 0) { + dht_unlock_namespace(frame, &local->lock[0]); + } - local->params = dict_ref (params); + DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - local->layout = dht_layout_new (this, conf->subvolume_cnt); - if (!local->layout) { + return 0; +} - op_errno = ENOMEM; - goto err; - } +static int +dht_mkdir_guard_parent_layout_cbk(call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, mode_t umask, + dict_t *params) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = 0; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + int ret = -1; + int32_t zero[1] = {0}; + + local = frame->local; + conf = this->private; + + gf_uuid_unparse(loc->parent->gfid, pgfid); + + if (local->op_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "mkdir (%s/%s) (path: %s): " + "Acquiring lock on parent to guard against " + "layout-change failed.", + pgfid, loc->name, loc->path); + goto err; + } + + local->op_ret = -1; + /* Add internal MDS xattr on disk for hashed subvol + */ + ret = dht_dict_set_array(params, conf->mds_xattr_key, zero, 1); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value:key = %s for " + "path %s", + conf->mds_xattr_key, loc->path); + } + + STACK_WIND_COOKIE(frame, dht_mkdir_hashed_cbk, local->hashed_subvol, + local->hashed_subvol, local->hashed_subvol->fops->mkdir, + loc, mode, umask, params); + + return 0; +err: + DHT_STACK_UNWIND(mkdir, frame, -1, local->op_errno, NULL, NULL, NULL, NULL, + NULL); - STACK_WIND 
(frame, dht_mkdir_hashed_cbk, - hashed_subvol, - hashed_subvol->fops->mkdir, - loc, mode, params); + return 0; +} - return 0; +int +dht_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *params) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = EINVAL, ret = -1; + xlator_t *hashed_subvol = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + call_stub_t *stub = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + VALIDATE_OR_GOTO(this->private, err); + + gf_uuid_unparse(loc->parent->gfid, pgfid); + + conf = this->private; + + if (!params || !dict_get(params, "gfid-req")) { + op_errno = EPERM; + gf_msg_callingfn(this->name, GF_LOG_WARNING, op_errno, + DHT_MSG_GFID_NULL, + "mkdir: %s is received " + "without gfid-req %p", + loc->path, params); + goto err; + } + + dht_get_du_info(frame, this, loc); + + local = dht_local_init(frame, loc, NULL, GF_FOP_MKDIR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + hashed_subvol = dht_subvol_get_hashed(this, loc); + if (hashed_subvol == NULL) { + gf_msg_debug(this->name, 0, "hashed subvol not found for %s", + loc->path); + local->op_errno = EIO; + goto err; + } + + local->hashed_subvol = hashed_subvol; + local->mode = mode; + local->umask = umask; + if (params) + local->params = dict_ref(params); + + local->inode = inode_ref(loc->inode); + + local->layout = dht_layout_new(this, conf->subvolume_cnt); + if (!local->layout) { + op_errno = ENOMEM; + goto err; + } + + /* set the newly created directory hash to the commit hash + * if the configuration option is set. 
If configuration option + * is not set, the older clients may still be connecting to the + * volume and hence we need to preserve the 1 in disk[0] part of the + * layout xattr */ + if (conf->lookup_optimize) + local->layout->commit_hash = conf->vol_commit_hash; + else + local->layout->commit_hash = DHT_LAYOUT_HASH_INVALID; + + stub = fop_mkdir_stub(frame, dht_mkdir_guard_parent_layout_cbk, loc, mode, + umask, params); + if (stub == NULL) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "mkdir (%s/%s) (path: %s): " + "creating stub failed.", + pgfid, loc->name, loc->path); + local->op_errno = ENOMEM; + goto err; + } + + ret = dht_guard_parent_layout_and_namespace(this, stub); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_PARENT_LAYOUT_CHANGED, + "mkdir (%s/%s) (path: %s) cannot wind lock request to " + "guard parent layout", + pgfid, loc->name, loc->path); + goto err; + } + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL); + op_errno = local ? 
local->op_errno : op_errno; + DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } - -int -dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) +static int +dht_rmdir_selfheal_cbk(call_frame_t *heal_frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; + dht_local_t *heal_local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; + heal_local = heal_frame->local; + main_frame = heal_local->main_frame; + local = main_frame->local; - if (local->loc.parent) { - local->preparent.ia_ino = local->loc.parent->ino; - local->postparent.ia_ino = local->loc.parent->ino; - } + DHT_STACK_DESTROY(heal_frame); + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); - DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, - &local->preparent, &local->postparent); + DHT_STACK_UNWIND(rmdir, main_frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, NULL); - return 0; + return 0; } - -int -dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent) +static int +dht_rmdir_hashed_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - local->op_ret = -1; - - if (op_errno != ENOENT) - local->need_selfheal = 1; - - gf_log (this->name, GF_LOG_DEBUG, - "rmdir on %s for %s failed (%s)", - prev->this->name, local->loc.path, - strerror (op_errno)); - goto unlock; + dht_local_t *local = NULL; + dht_local_t *heal_local = NULL; + call_frame_t 
*heal_frame = NULL; + dht_conf_t *conf = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + prev = cookie; + conf = this->private; + + gf_uuid_unparse(local->loc.gfid, gfid); + + LOCK(&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + local->op_ret = -1; + if (conf->subvolume_cnt != 1) { + if (op_errno != ENOENT && op_errno != EACCES && + op_errno != ESTALE) { + local->need_selfheal = 1; } + } - dht_iatt_merge (this, &local->preparent, preparent, prev->this); - dht_iatt_merge (this, &local->postparent, postparent, - prev->this); + gf_msg_debug(this->name, op_errno, + "rmdir on %s for %s failed " + "(gfid = %s)", + prev->name, local->loc.path, gfid); + goto unlock; } -unlock: - UNLOCK (&frame->lock); + dht_iatt_merge(this, &local->preparent, preparent); + dht_iatt_merge(this, &local->postparent, postparent); + } +unlock: + UNLOCK(&frame->lock); - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - if (local->need_selfheal) { - local->layout = - dht_layout_get (this, local->loc.inode); + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + if (local->need_selfheal) { + dht_rmdir_unlock(frame, this); + local->layout = dht_layout_get(this, local->loc.inode); - /* TODO: neater interface needed below */ - local->stbuf.ia_type = local->loc.inode->ia_type; + /* TODO: neater interface needed below */ + local->stbuf.ia_type = local->loc.inode->ia_type; - dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk, - &local->loc, local->layout); - } else { - if (local->loc.parent) { - local->preparent.ia_ino = - local->loc.parent->ino; - local->postparent.ia_ino = - local->loc.parent->ino; - - WIPE (&local->preparent); - WIPE (&local->postparent); - } + gf_uuid_copy(local->gfid, local->loc.inode->gfid); - DHT_STACK_UNWIND (rmdir, frame, local->op_ret, - local->op_errno, &local->preparent, - &local->postparent); - } - } + /* Use a 
different frame or else the rmdir op_ret is + * overwritten by that of the selfheal */ - return 0; -} + heal_frame = copy_frame(frame); + if (heal_frame == NULL) { + goto err; + } -int -dht_rmdir_do (call_frame_t *frame, xlator_t *this) -{ - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int i = 0; + heal_local = dht_local_init(heal_frame, &local->loc, NULL, 0); + if (!heal_local) { + DHT_STACK_DESTROY(heal_frame); + goto err; + } - VALIDATE_OR_GOTO (this->private, err); + heal_local->inode = inode_ref(local->loc.inode); + heal_local->main_frame = frame; + gf_uuid_copy(heal_local->gfid, local->loc.inode->gfid); - conf = this->private; - local = frame->local; + dht_selfheal_restore(heal_frame, dht_rmdir_selfheal_cbk, + &heal_local->loc, heal_local->layout); + return 0; + } else { + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->preparent, 0); - if (local->op_ret == -1) - goto err; + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); + } - local->call_cnt = conf->subvolume_cnt; + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_rmdir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->rmdir, - &local->loc, local->flags); + dht_rmdir_unlock(frame, this); + DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, NULL); } + } - return 0; + return 0; err: - DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, - &local->preparent, &local->postparent); - return 0; + DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno, NULL, NULL, + NULL); + return 0; } +static int +dht_rmdir_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + DHT_STACK_DESTROY(frame); + return 0; +} -int -dht_rmdir_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
- int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent) +static int +dht_rmdir_unlock(call_frame_t *frame, xlator_t *this) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - xlator_t *src = NULL; - call_frame_t *main_frame = NULL; - dht_local_t *main_local = NULL; - int this_call_cnt = 0; + dht_local_t *local = NULL, *lock_local = NULL; + call_frame_t *lock_frame = NULL; + int lock_count = 0; - local = frame->local; - prev = cookie; - src = prev->this; + local = frame->local; - main_frame = local->main_frame; - main_local = main_frame->local; + /* Unlock entrylk */ + dht_unlock_entrylk_wrapper(frame, &local->lock[0].ns.directory_ns); - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "unlinked linkfile %s on %s", - local->loc.path, src->name); - } else { - main_local->op_ret = -1; - main_local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "unlink of %s on %s failed (%s)", - local->loc.path, src->name, strerror (op_errno)); - } + /* Unlock inodelk */ + lock_count = dht_lock_count(local->lock[0].ns.parent_layout.locks, + local->lock[0].ns.parent_layout.lk_count); - this_call_cnt = dht_frame_return (main_frame); - if (is_last_call (this_call_cnt)) - dht_rmdir_do (main_frame, this); + if (lock_count == 0) + goto done; - DHT_STACK_DESTROY (frame); - return 0; -} + lock_frame = copy_frame(frame); + if (lock_frame == NULL) + goto done; + lock_local = dht_local_init(lock_frame, &local->loc, NULL, + lock_frame->root->op); + if (lock_local == NULL) + goto done; -int -dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, - struct iatt *stbuf, dict_t *xattr, struct iatt *parent) -{ - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - xlator_t *src = NULL; - call_frame_t *main_frame = NULL; - dht_local_t *main_local = NULL; - int this_call_cnt = 0; + lock_local->lock[0].ns.parent_layout.locks = local->lock[0] + .ns.parent_layout.locks; + 
lock_local->lock[0] + .ns.parent_layout.lk_count = local->lock[0].ns.parent_layout.lk_count; - local = frame->local; - prev = cookie; - src = prev->this; + local->lock[0].ns.parent_layout.locks = NULL; + local->lock[0].ns.parent_layout.lk_count = 0; + dht_unlock_inodelk(lock_frame, lock_local->lock[0].ns.parent_layout.locks, + lock_local->lock[0].ns.parent_layout.lk_count, + dht_rmdir_unlock_cbk); + lock_frame = NULL; - main_frame = local->main_frame; - main_local = main_frame->local; +done: + if (lock_frame != NULL) { + DHT_STACK_DESTROY(lock_frame); + } - if (op_ret != 0) - goto err; + return 0; +} - if (check_is_linkfile (inode, stbuf, xattr) == 0) { - main_local->op_ret = -1; - main_local->op_errno = ENOTEMPTY; +static int +dht_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + int done = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + dht_local_t *heal_local = NULL; + call_frame_t *heal_frame = NULL; + int ret = -1; + + local = frame->local; + prev = cookie; + + LOCK(&frame->lock); + { + if (op_ret == -1) { + if ((op_errno != ENOENT) && (op_errno != ESTALE)) { + local->op_errno = op_errno; + local->op_ret = -1; - gf_log (this->name, GF_LOG_WARNING, - "%s on %s found to be not a linkfile (type=0%o)", - local->loc.path, src->name, stbuf->ia_type); - goto err; + if (op_errno != EACCES) + local->need_selfheal = 1; + } + + gf_uuid_unparse(local->loc.gfid, gfid); + + gf_msg_debug(this->name, op_errno, + "rmdir on %s for %s failed." 
+ "(gfid = %s)", + prev->name, local->loc.path, gfid); + goto unlock; } - STACK_WIND (frame, dht_rmdir_linkfile_unlink_cbk, - src, src->fops->unlink, &local->loc); - return 0; -err: + /* Track if rmdir succeeded on at least one subvol*/ + local->fop_succeeded = 1; + dht_iatt_merge(this, &local->preparent, preparent); + dht_iatt_merge(this, &local->postparent, postparent); + } +unlock: + UNLOCK(&frame->lock); - this_call_cnt = dht_frame_return (main_frame); - if (is_last_call (this_call_cnt)) - dht_rmdir_do (main_frame, this); + this_call_cnt = dht_frame_return(frame); - DHT_STACK_DESTROY (frame); - return 0; -} + /* if local->hashed_subvol, we are yet to wind to hashed_subvol. */ + if (local->hashed_subvol && (this_call_cnt == 1)) { + done = 1; + } else if (!local->hashed_subvol && !this_call_cnt) { + done = 1; + } + if (done) { + if (local->need_selfheal && local->fop_succeeded) { + dht_rmdir_unlock(frame, this); + local->layout = dht_layout_get(this, local->loc.inode); -int -dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, - gf_dirent_t *entries, xlator_t *src) -{ - int ret = 0; - int build_ret = 0; - gf_dirent_t *trav = NULL; - call_frame_t *lookup_frame = NULL; - dht_local_t *lookup_local = NULL; - dht_local_t *local = NULL; + /* TODO: neater interface needed below */ + local->stbuf.ia_type = local->loc.inode->ia_type; - local = frame->local; + gf_uuid_copy(local->gfid, local->loc.inode->gfid); + heal_frame = copy_frame(frame); + if (heal_frame == NULL) { + goto err; + } - list_for_each_entry (trav, &entries->list, list) { - if (strcmp (trav->d_name, ".") == 0) - continue; - if (strcmp (trav->d_name, "..") == 0) - continue; - if (check_is_linkfile (NULL, (&trav->d_stat), NULL) == 1) { - ret++; - continue; - } + heal_local = dht_local_init(heal_frame, &local->loc, NULL, 0); + if (!heal_local) { + DHT_STACK_DESTROY(heal_frame); + goto err; + } + + heal_local->inode = inode_ref(local->loc.inode); + heal_local->main_frame = frame; + 
gf_uuid_copy(heal_local->gfid, local->loc.inode->gfid); + ret = dht_selfheal_restore(heal_frame, dht_rmdir_selfheal_cbk, + &heal_local->loc, heal_local->layout); + if (ret) { + DHT_STACK_DESTROY(heal_frame); + goto err; + } + + } else if (this_call_cnt) { + /* If non-hashed subvol's have responded, proceed */ + if (local->op_ret == 0) { + /* Delete the dir from the hashed subvol if: + * The fop succeeded on at least one subvol + * and did not fail on any + * or + * The fop failed with ENOENT/ESTALE on + * all subvols */ + + STACK_WIND_COOKIE(frame, dht_rmdir_hashed_subvol_cbk, + local->hashed_subvol, local->hashed_subvol, + local->hashed_subvol->fops->rmdir, + &local->loc, local->flags, NULL); + } else { + /* hashed-subvol was non-NULL and rmdir failed on + * all non hashed-subvols. Unwind rmdir with + * local->op_ret and local->op_errno. */ + dht_rmdir_unlock(frame, this); + DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, NULL); - /* this entry is either a directory which is neither "." nor "..", - or a non directory which is not a linkfile. 
the directory is to - be treated as non-empty - */ return 0; + } + } else if (!this_call_cnt) { + /* All subvol's have responded, proceed */ + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->preparent, 0); + + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); + } + + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); + + dht_rmdir_unlock(frame, this); + DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, NULL); } + } - list_for_each_entry (trav, &entries->list, list) { - if (strcmp (trav->d_name, ".") == 0) - continue; - if (strcmp (trav->d_name, "..") == 0) - continue; + return 0; - lookup_frame = NULL; - lookup_local = NULL; +err: + DHT_STACK_UNWIND(rmdir, frame, -1, local->op_errno, NULL, NULL, NULL); + return 0; +} - lookup_frame = copy_frame (frame); - if (!lookup_frame) { - /* out of memory, let the rmdir fail - (as non-empty, unfortunately) */ - goto err; - } +static int +dht_rmdir_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *hashed_subvol; - lookup_local = GF_CALLOC (sizeof (*local), 1, - gf_dht_mt_dht_local_t); - if (!lookup_local) { - goto err; - } + conf = this->private; + local = frame->local; - lookup_frame->local = lookup_local; - lookup_local->main_frame = frame; + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR, + "acquiring entrylk after inodelk failed rmdir for %s)", + local->loc.path); - build_ret = dht_build_child_loc (this, &lookup_local->loc, - &local->loc, trav->d_name); - if (build_ret != 0) - goto err; + local->op_ret = -1; + local->op_errno = op_errno; + goto err; + } - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", - lookup_local->loc.path, src->name); + hashed_subvol = 
local->hashed_subvol; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (hashed_subvol && (hashed_subvol == conf->subvolumes[i])) + continue; - LOCK (&frame->lock); - { - local->call_cnt++; - } - UNLOCK (&frame->lock); + STACK_WIND_COOKIE(frame, dht_rmdir_cbk, conf->subvolumes[i], + conf->subvolumes[i], conf->subvolumes[i]->fops->rmdir, + &local->loc, local->flags, NULL); + } - STACK_WIND (lookup_frame, dht_rmdir_lookup_cbk, - src, src->fops->lookup, - &lookup_local->loc, NULL); - ret++; - } + return 0; - return ret; err: - DHT_STACK_DESTROY (lookup_frame); - return 0; -} + DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, NULL); + return 0; +} -int -dht_rmdir_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries) +static int +dht_rmdir_do(call_frame_t *frame, xlator_t *this) { - dht_local_t *local = NULL; - int this_call_cnt = -1; - call_frame_t *prev = NULL; - xlator_t *src = NULL; - int ret = 0; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + xlator_t *hashed_subvol = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; - local = frame->local; - prev = cookie; - src = prev->this; - - if (op_ret > 2) { - ret = dht_rmdir_is_subvol_empty (frame, this, entries, src); - - switch (ret) { - case 0: /* non linkfiles exist */ - gf_log (this->name, GF_LOG_TRACE, - "readdir on %s for %s returned %d entries", - prev->this->name, local->loc.path, op_ret); - local->op_ret = -1; - local->op_errno = ENOTEMPTY; - break; - default: - /* @ret number of linkfiles are getting unlinked */ - gf_log (this->name, GF_LOG_TRACE, - "readdir on %s for %s found %d linkfiles", - prev->this->name, local->loc.path, ret); - break; - } - } + VALIDATE_OR_GOTO(frame->local, err); + local = frame->local; + VALIDATE_OR_GOTO(this->private, out); + conf = this->private; - this_call_cnt = dht_frame_return (frame); + if (local->op_ret == -1) + goto out; - if (is_last_call 
(this_call_cnt)) { - dht_rmdir_do (frame, this); - } + local->call_cnt = conf->subvolume_cnt; + + /* first remove from non-hashed_subvol */ + hashed_subvol = dht_subvol_get_hashed(this, &local->loc); + + if (!hashed_subvol) { + gf_uuid_unparse(local->loc.gfid, gfid); + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Failed to get hashed subvol for %s (gfid = %s)", + local->loc.path, gfid); + } else { + local->hashed_subvol = hashed_subvol; + } + + /* When DHT has only 1 child */ + if (conf->subvolume_cnt == 1) { + STACK_WIND_COOKIE(frame, dht_rmdir_hashed_subvol_cbk, + conf->subvolumes[0], conf->subvolumes[0], + conf->subvolumes[0]->fops->rmdir, &local->loc, + local->flags, NULL); return 0; + } + + local->current = &local->lock[0]; + ret = dht_protect_namespace(frame, &local->loc, local->hashed_subvol, + &local->current->ns, dht_rmdir_lock_cbk); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = errno ? errno : EINVAL; + goto out; + } + + return 0; + +out: + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); + + DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, NULL); + return 0; +err: + DHT_STACK_UNWIND(rmdir, frame, -1, EINVAL, NULL, NULL, NULL); + return 0; } +static void +dht_rmdir_readdirp_done(call_frame_t *readdirp_frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + dht_local_t *main_local = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + + local = readdirp_frame->local; + main_frame = local->main_frame; + main_local = main_frame->local; + + /* At least one readdirp failed. + * This is a bit hit or miss - if readdirp failed on more than + * one subvol, we don't know which error is returned. 
+ */ + if (local->op_ret == -1) { + main_local->op_ret = local->op_ret; + main_local->op_errno = local->op_errno; + } + + this_call_cnt = dht_frame_return(main_frame); + + if (is_last_call(this_call_cnt)) + dht_rmdir_do(main_frame, this); + + DHT_STACK_DESTROY(readdirp_frame); +} -int -dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) +/* Keep sending readdirp on the subvol until it returns no more entries + * It is possible that not all entries will fit in a single readdirp in + * which case the rmdir will keep failing with ENOTEMPTY + */ + +static int +dht_rmdir_readdirp_do(call_frame_t *readdirp_frame, xlator_t *this) { - dht_local_t *local = NULL; - int this_call_cnt = -1; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; + local = readdirp_frame->local; - local = frame->local; - prev = cookie; + if (local->op_ret == -1) { + /* there is no point doing another readdirp on this + * subvol . */ + dht_rmdir_readdirp_done(readdirp_frame, this); + return 0; + } - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "opendir on %s for %s failed (%s)", - prev->this->name, local->loc.path, - strerror (op_errno)); - goto err; - } + STACK_WIND_COOKIE(readdirp_frame, dht_rmdir_readdirp_cbk, + local->hashed_subvol, local->hashed_subvol, + local->hashed_subvol->fops->readdirp, local->fd, 4096, 0, + local->xattr); - STACK_WIND (frame, dht_rmdir_readdirp_cbk, - prev->this, prev->this->fops->readdirp, - local->fd, 4096, 0); + return 0; +} - return 0; +static int +dht_rmdir_linkfile_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + xlator_t *src = NULL; + call_frame_t *readdirp_frame = NULL; + dht_local_t *readdirp_local = NULL; + int this_call_cnt = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + prev = cookie; + src = 
prev; + + readdirp_frame = local->main_frame; + readdirp_local = readdirp_frame->local; + + gf_uuid_unparse(local->loc.gfid, gfid); + + if (op_ret == 0) { + gf_msg_trace(this->name, 0, "Unlinked linkfile %s on %s, gfid = %s", + local->loc.path, src->name, gfid); + } else { + if (op_errno != ENOENT) { + readdirp_local->op_ret = -1; + readdirp_local->op_errno = op_errno; + } + gf_msg_debug(this->name, op_errno, + "Unlink of %s on %s failed. (gfid = %s)", local->loc.path, + src->name, gfid); + } + + this_call_cnt = dht_frame_return(readdirp_frame); + + if (is_last_call(this_call_cnt)) + dht_rmdir_readdirp_do(readdirp_frame, this); + + DHT_STACK_DESTROY(frame); + return 0; +} +static int +dht_rmdir_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, struct iatt *parent) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + xlator_t *src = NULL; + call_frame_t *readdirp_frame = NULL; + dht_local_t *readdirp_local = NULL; + int this_call_cnt = 0; + dht_conf_t *conf = this->private; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + prev = cookie; + src = prev; + + gf_msg_debug(this->name, 0, "dht_rmdir_lookup_cbk %s", local->loc.path); + + readdirp_frame = local->main_frame; + readdirp_local = readdirp_frame->local; + + if (op_ret != 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_FILE_LOOKUP_FAILED, + "lookup failed for %s on %s", local->loc.path, src->name); + goto err; + } + + if (!check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name)) { + readdirp_local->op_ret = -1; + readdirp_local->op_errno = ENOTEMPTY; + + gf_uuid_unparse(local->loc.gfid, gfid); + + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NOT_LINK_FILE_ERROR, + "%s on %s is not a linkfile (type=0%o, gfid = %s)", + local->loc.path, src->name, stbuf->ia_type, gfid); + goto err; + } + + STACK_WIND_COOKIE(frame, dht_rmdir_linkfile_unlink_cbk, src, src, + src->fops->unlink, 
&local->loc, 0, NULL); + return 0; err: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - dht_rmdir_do (frame, this); - } + this_call_cnt = dht_frame_return(readdirp_frame); + if (is_last_call(this_call_cnt)) { + dht_rmdir_readdirp_do(readdirp_frame, this); + } - return 0; + DHT_STACK_DESTROY(frame); + return 0; } - -int -dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) +static int +dht_rmdir_cached_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *parent) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int op_errno = -1; - int i = -1; - int ret = -1; + dht_local_t *local = NULL; + xlator_t *src = NULL; + call_frame_t *readdirp_frame = NULL; + dht_local_t *readdirp_local = NULL; + int this_call_cnt = 0; + dht_conf_t *conf = this->private; + dict_t *xattrs = NULL; + int ret = 0; + + local = frame->local; + src = local->hashed_subvol; + + /* main_frame here is the readdirp_frame */ + + readdirp_frame = local->main_frame; + readdirp_local = readdirp_frame->local; + + gf_msg_debug(this->name, 0, "returning for %s ", local->loc.path); + + if (op_ret == 0) { + readdirp_local->op_ret = -1; + readdirp_local->op_errno = ENOTEMPTY; + + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_ERROR, + "%s found on cached subvol %s", local->loc.path, src->name); + goto err; + } else if (op_errno != ENOENT) { + readdirp_local->op_ret = -1; + readdirp_local->op_errno = op_errno; + + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_SUBVOL_ERROR, + "%s not found on cached subvol %s", local->loc.path, src->name); + goto err; + } + + xattrs = dict_new(); + if (!xattrs) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "dict_new failed"); + goto err; + } + + ret = dict_set_uint32(xattrs, conf->link_xattr_name, 256); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + 
"Failed to set dictionary value: key = %s", + conf->link_xattr_name); + if (xattrs) + dict_unref(xattrs); + goto err; + } + STACK_WIND_COOKIE(frame, dht_rmdir_lookup_cbk, src, src, src->fops->lookup, + &local->loc, xattrs); + if (xattrs) + dict_unref(xattrs); + + return 0; +err: + this_call_cnt = dht_frame_return(readdirp_frame); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (this->private, err); + /* Once all the lookups/unlinks etc have returned, proceed to wind + * readdirp on the subvol again until no entries are returned. + * This is required if there are more entries than can be returned + * in a single readdirp call. + */ - conf = this->private; + if (is_last_call(this_call_cnt)) + dht_rmdir_readdirp_do(readdirp_frame, this); - local = dht_local_init (frame); - if (!local) { + DHT_STACK_DESTROY(frame); + return 0; +} - op_errno = ENOMEM; - goto err; - } +static int +dht_rmdir_is_subvol_empty(call_frame_t *frame, xlator_t *this, + gf_dirent_t *entries, xlator_t *src) +{ + int ret = 0; + int build_ret = 0; + gf_dirent_t *trav = NULL; + call_frame_t *lookup_frame = NULL; + dht_local_t *lookup_local = NULL; + dht_local_t *local = NULL; + dict_t *xattrs = NULL; + dht_conf_t *conf = this->private; + xlator_t *subvol = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + int count = 0; + gf_boolean_t unwind = _gf_false; + + local = frame->local; + + list_for_each_entry(trav, &entries->list, list) + { + if (strcmp(trav->d_name, ".") == 0) + continue; + if (strcmp(trav->d_name, "..") == 0) + continue; + if (check_is_linkfile(NULL, (&trav->d_stat), trav->dict, + conf->link_xattr_name)) { + count++; + continue; + } + + /* this entry is either a directory which is neither "." nor "..", + or a non directory which is not a linkfile. 
the directory is to + be treated as non-empty + */ + return 0; + } + + xattrs = dict_new(); + if (!xattrs) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "dict_new failed"); + return -1; + } - local->call_cnt = conf->subvolume_cnt; - local->op_ret = 0; + ret = dict_set_uint32(xattrs, conf->link_xattr_name, 256); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value: key = %s", + conf->link_xattr_name); - ret = loc_copy (&local->loc, loc); - if (ret == -1) { + if (xattrs) + dict_unref(xattrs); + return -1; + } - op_errno = ENOMEM; - goto err; - } + local->call_cnt = count; + ret = 0; - local->flags = flags; + list_for_each_entry(trav, &entries->list, list) + { + if (strcmp(trav->d_name, ".") == 0) + continue; + if (strcmp(trav->d_name, "..") == 0) + continue; - local->fd = fd_create (local->loc.inode, frame->root->pid); - if (!local->fd) { + lookup_frame = copy_frame(frame); - op_errno = ENOMEM; - goto err; + if (!lookup_frame) { + /* out of memory, let the rmdir fail + (as non-empty, unfortunately) */ + goto err; } - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_rmdir_opendir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->opendir, - loc, local->fd); + lookup_local = dht_local_init(lookup_frame, NULL, NULL, GF_FOP_LOOKUP); + if (!lookup_local) { + goto err; } - return 0; + lookup_frame->local = lookup_local; + lookup_local->main_frame = frame; + lookup_local->hashed_subvol = src; -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (rmdir, frame, -1, op_errno, - NULL, NULL); + build_ret = dht_build_child_loc(this, &lookup_local->loc, &local->loc, + trav->d_name); + if (build_ret != 0) + goto err; - return 0; -} + gf_uuid_copy(lookup_local->loc.gfid, trav->d_stat.ia_gfid); + gf_uuid_unparse(lookup_local->loc.gfid, gfid); -int -dht_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) -{ - DHT_STACK_UNWIND (xattrop, frame, op_ret, op_errno, dict); - return 0; -} + gf_msg_trace(this->name, 0, "looking up %s on subvolume %s, gfid = %s", + lookup_local->loc.path, src->name, gfid); + subvol = dht_linkfile_subvol(this, NULL, &trav->d_stat, trav->dict); + if (!subvol || (subvol == src)) { + /* we need to delete the linkto file if it does not have a + * valid subvol or it points to itself. + */ + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_INVALID_LINKFILE, + "Linkfile does not have link subvolume. " + "path = %s, gfid = %s", + lookup_local->loc.path, gfid); -int -dht_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t flags, dict_t *dict) -{ - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + gf_msg_debug(this->name, 0, "looking up %s on subvol %s, gfid = %s", + lookup_local->loc.path, src->name, gfid); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + STACK_WIND_COOKIE(lookup_frame, dht_rmdir_lookup_cbk, src, src, + src->fops->lookup, &lookup_local->loc, xattrs); + } else { + gf_msg_debug(this->name, 0, + "Looking up linkfile target %s on " + " subvol %s, gfid = %s", + lookup_local->loc.path, subvol->name, gfid); - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; + STACK_WIND(lookup_frame, 
dht_rmdir_cached_lookup_cbk, subvol, + subvol->fops->lookup, &lookup_local->loc, xattrs); } + ret++; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; + lookup_frame = NULL; + lookup_local = NULL; + } - goto err; - } + if (xattrs) + dict_unref(xattrs); - local->inode = inode_ref (loc->inode); - local->call_cnt = 1; - - STACK_WIND (frame, - dht_xattrop_cbk, - subvol, subvol->fops->xattrop, - loc, flags, dict); + return ret; +err: + if (xattrs) + dict_unref(xattrs); - return 0; + if (lookup_frame) + DHT_STACK_DESTROY(lookup_frame); -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL); + /* Handle the case where the wound calls have unwound before the + * loop processing is done + */ - return 0; -} + LOCK(&frame->lock); + { + local->op_ret = -1; + local->op_errno = ENOTEMPTY; + local->call_cnt -= (count - ret); + if (!local->call_cnt) + unwind = _gf_true; + } + UNLOCK(&frame->lock); -int -dht_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) -{ - DHT_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, dict); - return 0; + if (!unwind) { + return ret; + } + return 0; } +/* + * No more entries on this subvol. Proceed to the actual rmdir operation. + */ -int -dht_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict) +static int +dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + xlator_t *src = NULL; + int ret = 0; + char *path = NULL; + + local = frame->local; + prev = cookie; + src = prev; + + if (op_ret > 2) { + /* dht_rmdir_is_subvol_empty() may free the frame, + * copy path for logging. 
+ */ + path = gf_strdup(local->loc.path); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + ret = dht_rmdir_is_subvol_empty(frame, this, entries, src); - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; + switch (ret) { + case 0: /* non linkfiles exist */ + gf_msg_trace(this->name, 0, + "readdir on %s for %s returned %d " + "entries", + prev->name, local->loc.path, op_ret); + local->op_ret = -1; + local->op_errno = ENOTEMPTY; + break; + default: + /* @ret number of linkfiles are getting unlinked */ + gf_msg_trace(this->name, 0, + "readdir on %s for %s found %d " + "linkfiles", + prev->name, path, ret); + break; } + } - STACK_WIND (frame, - dht_fxattrop_cbk, - subvol, subvol->fops->fxattrop, - fd, flags, dict); + /* readdirp failed or no linkto files were found on this subvol */ + if (!ret) + dht_rmdir_readdirp_done(frame, this); - return 0; + GF_FREE(path); + return 0; +} + +static int +dht_rmdir_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + xlator_t *prev = NULL; + int ret = 0; + dht_conf_t *conf = this->private; + dict_t *dict = NULL; + int i = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + dht_local_t *readdirp_local = NULL; + call_frame_t *readdirp_frame = NULL; + int cnt = 0; + + local = frame->local; + prev = cookie; + + this_call_cnt = dht_frame_return(frame); + if (op_ret == -1) { + gf_uuid_unparse(local->loc.gfid, gfid); + + gf_msg_debug(this->name, op_errno, + "opendir on %s for %s failed, " + "gfid = %s,", + prev->name, local->loc.path, gfid); + if ((op_errno != ENOENT) && (op_errno != ESTALE)) { + local->op_ret = -1; + local->op_errno = op_errno; + } + goto err; + } + + if (!is_last_call(this_call_cnt)) + return 0; + + if (local->op_ret == -1) + goto err; + + 
fd_bind(fd); + + dict = dict_new(); + if (!dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + + ret = dict_set_uint32(dict, conf->link_xattr_name, 256); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "%s: Failed to set dictionary value:key = %s", local->loc.path, + conf->link_xattr_name); + + cnt = local->call_cnt = conf->subvolume_cnt; + + /* Create a separate frame per subvol as we might need + * to resend readdirp multiple times to get all the + * entries. + */ + + for (i = 0; i < conf->subvolume_cnt; i++) { + readdirp_frame = copy_frame(frame); + + if (!readdirp_frame) { + cnt--; + /* Reduce the local->call_cnt as well */ + (void)dht_frame_return(frame); + continue; + } + + readdirp_local = dht_local_init(readdirp_frame, &local->loc, local->fd, + 0); + + if (!readdirp_local) { + DHT_STACK_DESTROY(readdirp_frame); + cnt--; + /* Reduce the local->call_cnt as well */ + dht_frame_return(frame); + continue; + } + readdirp_local->main_frame = frame; + readdirp_local->op_ret = 0; + readdirp_local->xattr = dict_ref(dict); + /* overload this field to save the subvol info */ + readdirp_local->hashed_subvol = conf->subvolumes[i]; + + STACK_WIND_COOKIE(readdirp_frame, dht_rmdir_readdirp_cbk, + conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->readdirp, + readdirp_local->fd, 4096, 0, readdirp_local->xattr); + } + + if (dict) + dict_unref(dict); + + /* Could not wind readdirp to any subvol */ + + if (!cnt) + goto err; + + return 0; err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL); + if (is_last_call(this_call_cnt)) { + dht_rmdir_do(frame, this); + } - return 0; + return 0; } - int -dht_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - +dht_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { - DHT_STACK_UNWIND (inodelk, frame, op_ret, op_errno); - return 0; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int i = -1; + int ret = -1; + dict_t *xattr_req = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + local = dht_local_init(frame, loc, NULL, GF_FOP_RMDIR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->call_cnt = conf->subvolume_cnt; + local->op_ret = 0; + local->fop_succeeded = 0; + + local->flags = flags; + + local->fd = fd_create(local->loc.inode, frame->root->pid); + if (!local->fd) { + op_errno = ENOMEM; + goto err; + } + + if (flags) { + return dht_rmdir_do(frame, this); + } + if (xdata) { + xattr_req = dict_ref(xdata); + } else { + xattr_req = dict_new(); + } + if (xattr_req) { + ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256); + /* If parallel-readdir is enabled, this is required + * to handle stale linkto files in the directory + * being deleted. If this fails, log an error but + * do not prevent the operation. 
+ */ + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "%s: failed to set key %s", + loc->path, conf->link_xattr_name); + } + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "%s: failed to set key %s", + loc->path, conf->link_xattr_name); + } + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_rmdir_opendir_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->opendir, loc, local->fd, + xattr_req); + } + + if (xattr_req) { + dict_unref(xattr_req); + } + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(rmdir, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; } +static int +dht_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) -int32_t -dht_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *lock) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + DHT_STACK_UNWIND(entrylk, frame, op_ret, op_errno, xdata); + return 0; +} +/* TODO + * Sending entrylk to cached subvol can result in stale lock + * as described in the bug 1311002. 
+ */ +int +dht_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + local = dht_local_init(frame, loc, NULL, GF_FOP_ENTRYLK); + if (!local) { + op_errno = ENOMEM; + goto err; + } - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; + subvol = local->cached_subvol; + if (!subvol) { + gf_uuid_unparse(loc->gfid, gfid); - goto err; - } + gf_msg_debug(this->name, 0, + "no cached subvolume for path=%s, " + "gfid = %s", + loc->path, gfid); + op_errno = EINVAL; + goto err; + } - local->inode = inode_ref (loc->inode); - local->call_cnt = 1; + local->call_cnt = 1; - STACK_WIND (frame, - dht_inodelk_cbk, - subvol, subvol->fops->inodelk, - volume, loc, cmd, lock); + STACK_WIND(frame, dht_entrylk_cbk, subvol, subvol->fops->entrylk, volume, + loc, basename, cmd, type, xdata); - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (inodelk, frame, -1, op_errno); + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(entrylk, frame, -1, op_errno, NULL); - return 0; + return 0; } - -int -dht_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +static int +dht_fentrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - DHT_STACK_UNWIND (finodelk, frame, op_ret, op_errno); - return 0; + DHT_STACK_UNWIND(fentrylk, frame, op_ret, op_errno, NULL); + return 0; } - int -dht_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *lock) +dht_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; + xlator_t *subvol = NULL; + int op_errno = -1; + char gfid[GF_UUID_BUF_SIZE] = {0}; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + VALIDATE_OR_GOTO(fd->inode, err); - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + gf_uuid_unparse(fd->inode->gfid, gfid); + subvol = dht_subvol_get_cached(this, fd->inode); + if (!subvol) { + gf_msg_debug(this->name, 0, + "No cached subvolume for fd=%p," + " gfid = %s", + fd, gfid); + op_errno = EINVAL; + goto err; + } - STACK_WIND (frame, - dht_finodelk_cbk, - subvol, subvol->fops->finodelk, - volume, fd, cmd, lock); + STACK_WIND(frame, dht_fentrylk_cbk, subvol, subvol->fops->fentrylk, volume, + fd, basename, cmd, type, xdata); - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (finodelk, frame, -1, op_errno); + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(fentrylk, frame, -1, op_errno, NULL); - return 0; + return 0; } +static int32_t +dht_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; -int -dht_entrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); -{ - DHT_STACK_UNWIND (entrylk, frame, op_ret, op_errno); - return 0; -} + local = frame->local; + LOCK(&frame->lock); + { + if (op_ret < 0 && op_errno != ENOTCONN) { + local->op_errno = op_errno; + goto unlock; + } + local->op_ret = 0; + } +unlock: + UNLOCK(&frame->lock); -int -dht_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + DHT_STACK_UNWIND(ipc, frame, local->op_ret, local->op_errno, NULL); + } + +out: + return 0; +} + +int32_t +dht_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + dht_local_t *local = NULL; + int op_errno = EINVAL; + dht_conf_t *conf = NULL; + int call_cnt = 0; + int i = 0; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + if (op != GF_IPC_TARGET_UPCALL) + goto wind_default; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; + VALIDATE_OR_GOTO(this->private, 
err); + conf = this->private; - goto err; - } + local = dht_local_init(frame, NULL, NULL, GF_FOP_IPC); + if (!local) { + op_errno = ENOMEM; + goto err; + } - local->inode = inode_ref (loc->inode); - local->call_cnt = 1; + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; - STACK_WIND (frame, dht_entrylk_cbk, - subvol, subvol->fops->entrylk, - volume, loc, basename, cmd, type); + if (xdata) { + if (dict_set_int8(xdata, conf->xattr_name, 0) < 0) + goto err; + } - return 0; + for (i = 0; i < call_cnt; i++) { + STACK_WIND(frame, dht_ipc_cbk, conf->subvolumes[i], + conf->subvolumes[i]->fops->ipc, op, xdata); + } + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (entrylk, frame, -1, op_errno); + DHT_STACK_UNWIND(ipc, frame, -1, op_errno, NULL); - return 0; -} + return 0; +wind_default: + STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ipc, op, xdata); + return 0; +} int -dht_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - +dht_forget(xlator_t *this, inode_t *inode) { - DHT_STACK_UNWIND (fentrylk, frame, op_ret, op_errno); + uint64_t ctx_int = 0; + dht_inode_ctx_t *ctx = NULL; + dht_layout_t *layout = NULL; + + inode_ctx_del(inode, this, &ctx_int); + + if (!ctx_int) return 0; -} + ctx = (dht_inode_ctx_t *)(long)ctx_int; + + layout = ctx->layout; + ctx->layout = NULL; + dht_layout_unref(this, layout); + GF_FREE(ctx); + + return 0; +} int -dht_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) +dht_notify(xlator_t *this, int event, void *data, ...) 
{ - xlator_t *subvol = NULL; - int op_errno = -1; + xlator_t *subvol = NULL; + int cnt = -1; + int i = -1; + dht_conf_t *conf = NULL; + int ret = -1; + int propagate = 0; + + int had_heard_from_all = 0; + int have_heard_from_all = 0; + gf_defrag_info_t *defrag = NULL; + dict_t *dict = NULL; + gf_defrag_type cmd = 0; + dict_t *output = NULL; + va_list ap; + struct gf_upcall *up_data = NULL; + struct gf_upcall_cache_invalidation *up_ci = NULL; + + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, out); + + /* had all subvolumes reported status once till now? */ + had_heard_from_all = 1; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->last_event[i]) { + had_heard_from_all = 0; + } + } + + switch (event) { + case GF_EVENT_CHILD_UP: + subvol = data; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + conf->gen++; - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + cnt = i; + break; + } + } - STACK_WIND (frame, dht_fentrylk_cbk, - subvol, subvol->fops->fentrylk, - volume, fd, basename, cmd, type); + if (cnt == -1) { + gf_msg_debug(this->name, 0, + "got GF_EVENT_CHILD_UP bad " + "subvolume %s", + subvol->name); + break; + } - return 0; + LOCK(&conf->subvolume_lock); + { + conf->subvolume_status[cnt] = 1; + conf->last_event[cnt] = event; + conf->subvol_up_time[cnt] = gf_time(); + } + UNLOCK(&conf->subvolume_lock); -err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (fentrylk, frame, -1, op_errno); + /* one of the node came back up, do a stat update */ + dht_get_du_info_for_subvol(this, cnt); - return 0; -} + break; + case GF_EVENT_SOME_DESCENDENT_UP: + subvol = data; + conf->gen++; + propagate = 1; -int -dht_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *statpre, - struct iatt *statpost) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; + break; + case GF_EVENT_SOME_DESCENDENT_DOWN: + subvol = data; + propagate = 1; - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + break; - dht_iatt_merge (this, &local->prebuf, statpre, prev->this); - dht_iatt_merge (this, &local->stbuf, statpost, prev->this); + case GF_EVENT_CHILD_DOWN: + subvol = data; - if (local->inode) { - local->prebuf.ia_ino = local->inode->ino; - local->stbuf.ia_ino = local->inode->ino; + if (conf->assert_no_child_down) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_CHILD_DOWN, + "Received CHILD_DOWN. 
Exiting"); + if (conf->defrag) { + gf_defrag_stop(conf, GF_DEFRAG_STATUS_FAILED, NULL); + } else { + kill(getpid(), SIGTERM); } + } - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + cnt = i; + break; + } + } - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (setattr, frame, local->op_ret, local->op_errno, - &local->prebuf, &local->stbuf); + if (cnt == -1) { + gf_msg_debug(this->name, 0, + "got GF_EVENT_CHILD_DOWN bad " + "subvolume %s", + subvol->name); + break; + } - return 0; -} + LOCK(&conf->subvolume_lock); + { + conf->subvolume_status[cnt] = 0; + conf->last_event[cnt] = event; + conf->subvol_up_time[cnt] = 0; + } + UNLOCK(&conf->subvolume_lock); + for (i = 0; i < conf->subvolume_cnt; i++) + if (conf->last_event[i] != event) + event = GF_EVENT_SOME_DESCENDENT_DOWN; + break; -int -dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid) -{ - dht_layout_t *layout = NULL; - dht_local_t *local = NULL; - int op_errno = -1; - int i = -1; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_DEBUG, - "memory allocation failed :("); - goto err; - } + case GF_EVENT_CHILD_CONNECTING: + subvol = data; - local->layout = layout = dht_layout_get (this, loc->inode); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + cnt = i; + break; + } + } - if (!layout_is_sane (layout)) { - gf_log (this->name, GF_LOG_DEBUG, - "layout is not sane for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + if (cnt == -1) { 
+ gf_msg_debug(this->name, 0, + "got GF_EVENT_CHILD_CONNECTING" + " bad subvolume %s", + subvol->name); + break; + } - local->inode = inode_ref (loc->inode); - local->call_cnt = layout->cnt; + LOCK(&conf->subvolume_lock); + { + conf->last_event[cnt] = event; + } + UNLOCK(&conf->subvolume_lock); - for (i = 0; i < layout->cnt; i++) { - STACK_WIND (frame, dht_setattr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->setattr, - loc, stbuf, valid); - } + break; + case GF_EVENT_VOLUME_DEFRAG: { + if (!conf->defrag) { + return ret; + } + defrag = conf->defrag; - return 0; + dict = data; + va_start(ap, data); + output = va_arg(ap, dict_t *); -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL); + ret = dict_get_int32(dict, "rebalance-command", (int32_t *)&cmd); + if (ret) { + va_end(ap); + return ret; + } + LOCK(&defrag->lock); + { + if (defrag->is_exiting) + goto unlock; + if ((cmd == GF_DEFRAG_CMD_STATUS) || + (cmd == GF_DEFRAG_CMD_DETACH_STATUS)) + gf_defrag_status_get(conf, output); + else if (cmd == GF_DEFRAG_CMD_DETACH_START) + defrag->cmd = GF_DEFRAG_CMD_DETACH_START; + else if (cmd == GF_DEFRAG_CMD_STOP || + cmd == GF_DEFRAG_CMD_DETACH_STOP) + gf_defrag_stop(conf, GF_DEFRAG_STATUS_STOPPED, output); + } + unlock: + UNLOCK(&defrag->lock); + va_end(ap); + return ret; + break; + } + case GF_EVENT_UPCALL: + up_data = (struct gf_upcall *)data; + if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION) + break; + up_ci = (struct gf_upcall_cache_invalidation *)up_data->data; + + /* Since md-cache will be aggressively filtering lookups, + * the stale layout issue will be more pronounced. 
Hence + * when a layout xattr is changed by the rebalance process + * notify all the md-cache clients to invalidate the existing + * stat cache and send the lookup next time*/ + if (up_ci->dict && dict_get(up_ci->dict, conf->xattr_name)) + up_ci->flags |= UP_EXPLICIT_LOOKUP; + + /* TODO: Instead of invalidating iatt, update the new + * hashed/cached subvolume in dht inode_ctx */ + if (IS_DHT_LINKFILE_MODE(&up_ci->stat)) + up_ci->flags |= UP_EXPLICIT_LOOKUP; + + propagate = 1; + break; + default: + propagate = 1; + break; + } + + /* have all subvolumes reported status once by now? */ + have_heard_from_all = 1; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->last_event[i]) + have_heard_from_all = 0; + } + + /* if all subvols have reported status, no need to hide anything + or wait for anything else. Just propagate blindly */ + if (have_heard_from_all) { + propagate = 1; + } + + if (!had_heard_from_all && have_heard_from_all) { + static int run_defrag = 0; + /* This is the first event which completes aggregation + of events from all subvolumes. If at least one subvol + had come up, propagate CHILD_UP, but only this time + */ + event = GF_EVENT_CHILD_DOWN; - return 0; -} + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->last_event[i] == GF_EVENT_CHILD_UP) { + event = GF_EVENT_CHILD_UP; + break; + } + if (conf->last_event[i] == GF_EVENT_CHILD_CONNECTING) { + event = GF_EVENT_CHILD_CONNECTING; + /* continue to check other events for CHILD_UP */ + } + } + + /* Rebalance is started with assert_no_child_down. So we do + * not need to handle CHILD_DOWN event here. + * + * If there is a graph switch, we should not restart the + * rebalance daemon. Use 'run_defrag' to indicate if the + * thread has already started. 
+ */ + if (conf->defrag && !run_defrag) { + run_defrag = 1; + ret = gf_thread_create(&conf->defrag->th, NULL, gf_defrag_start, + this, "dhtdg"); + if (ret) { + GF_FREE(conf->defrag); + conf->defrag = NULL; + kill(getpid(), SIGTERM); + } + } + } + + ret = 0; + if (propagate) + ret = default_notify(this, event, data); +out: + return ret; +} int -dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, - int32_t valid) +dht_inode_ctx_layout_get(inode_t *inode, xlator_t *this, dht_layout_t **layout) { - dht_layout_t *layout = NULL; - dht_local_t *local = NULL; - int op_errno = -1; - int i = -1; - + dht_inode_ctx_t *ctx = NULL; + int ret = -1; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - - goto err; - } + ret = dht_inode_ctx_get(inode, this, &ctx); - local->layout = layout = dht_layout_get (this, fd->inode); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for fd=%p", fd); - op_errno = EINVAL; - goto err; + if (!ret && ctx) { + if (ctx->layout) { + if (layout) + *layout = ctx->layout; + ret = 0; + } else { + ret = -1; } + } - if (!layout_is_sane (layout)) { - gf_log (this->name, GF_LOG_DEBUG, - "layout is not sane for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + return ret; +} - local->inode = inode_ref (fd->inode); - local->call_cnt = layout->cnt; +void +dht_log_new_layout_for_dir_selfheal(xlator_t *this, loc_t *loc, + dht_layout_t *layout) +{ + char string[2048] = {0}; + char *output_string = NULL; + int len = 0; + int off = 0; + int i = 0; + gf_loglevel_t log_level = gf_log_get_loglevel(); + int ret = 0; + + if (log_level < GF_LOG_INFO) + return; - for (i = 0; i < layout->cnt; i++) { - STACK_WIND (frame, dht_setattr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->fsetattr, - fd, stbuf, valid); - } + if (!layout) + return; - return 0; + if (!layout->cnt) + return; -err: - op_errno = 
(op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL); + if (!loc) + return; - return 0; -} + if (!loc->path) + return; + ret = snprintf(string, sizeof(string), "Setting layout of %s with ", + loc->path); -int -dht_forget (xlator_t *this, inode_t *inode) -{ - uint64_t tmp_layout = 0; - dht_layout_t *layout = NULL; + if (ret < 0) + return; - inode_ctx_get (inode, this, &tmp_layout); + len += ret; - if (!tmp_layout) - return 0; + /* Calculation of total length of the string required to calloc + * output_string. Log includes subvolume-name, start-range, end-range + * and err value. + * + * This log will help to debug cases where: + * a) Different processes set different layout of a directory. + * b) Error captured in lookup, which will be filled in layout->err + * (like ENOENT, ESTALE etc) + */ - layout = (dht_layout_t *)(long)tmp_layout; - dht_layout_unref (this, layout); + for (i = 0; i < layout->cnt; i++) { + ret = snprintf(string, sizeof(string), + "[Subvol_name: %s, Err: %d , Start: " + "0x%x, Stop: 0x%x, Hash: 0x%x], ", + layout->list[i].xlator->name, layout->list[i].err, + layout->list[i].start, layout->list[i].stop, + layout->list[i].commit_hash); - return 0; -} + if (ret < 0) + return; + len += ret; + } + len++; -int -dht_init_subvolumes (xlator_t *this, dht_conf_t *conf) -{ - xlator_list_t *subvols = NULL; - int cnt = 0; + output_string = GF_MALLOC(len + 1, gf_common_mt_char); - if (!conf) - return -1; + if (!output_string) + return; - for (subvols = this->children; subvols; subvols = subvols->next) - cnt++; + ret = snprintf(output_string, len + 1, "Setting layout of %s with ", + loc->path); - conf->subvolumes = GF_CALLOC (cnt, sizeof (xlator_t *), - gf_dht_mt_xlator_t); - if (!conf->subvolumes) { + if (ret < 0) + goto err; - return -1; - } - conf->subvolume_cnt = cnt; + off += ret; - cnt = 0; - for (subvols = this->children; subvols; subvols = subvols->next) - conf->subvolumes[cnt++] = subvols->xlator; + for (i 
= 0; i < layout->cnt; i++) { + ret = snprintf(output_string + off, len - off, + "[Subvol_name: %s, Err: %d , Start: " + "0x%x, Stop: 0x%x, Hash: 0x%x], ", + layout->list[i].xlator->name, layout->list[i].err, + layout->list[i].start, layout->list[i].stop, + layout->list[i].commit_hash); - conf->subvolume_status = GF_CALLOC (cnt, sizeof (char), - gf_dht_mt_char); - if (!conf->subvolume_status) { + if (ret < 0) + goto err; - return -1; - } + off += ret; + } - conf->last_event = GF_CALLOC (cnt, sizeof (int), - gf_dht_mt_char); - if (!conf->last_event) { + gf_msg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_LOG_FIXED_LAYOUT, "%s", + output_string); - return -1; - } - return 0; +err: + GF_FREE(output_string); } - -int -dht_notify (xlator_t *this, int event, void *data, ...) +int32_t +dht_migration_get_dst_subvol(xlator_t *this, dht_local_t *local) { - xlator_t *subvol = NULL; - int cnt = -1; - int i = -1; - dht_conf_t *conf = NULL; - int ret = -1; - int propagate = 0; + int ret = -1; - int had_heard_from_all = 0; - int have_heard_from_all = 0; + if (!local) + goto out; + local->rebalance.target_node = dht_subvol_get_hashed(this, &local->loc); - conf = this->private; - if (!conf) - return ret; + if (local->rebalance.target_node) + ret = 0; - /* had all subvolumes reported status once till now? */ - had_heard_from_all = 1; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (!conf->last_event[i]) { - had_heard_from_all = 0; - } - } +out: + return ret; +} - switch (event) { - case GF_EVENT_CHILD_UP: - subvol = data; +/* +This function should not be called more then once during a FOP +handling path. 
It is valid only for for ops on files +*/ +int32_t +dht_set_local_rebalance(xlator_t *this, dht_local_t *local, struct iatt *stbuf, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + if (!local) + return -1; - conf->gen++; + if (local->rebalance.set) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_REBAL_STRUCT_SET, + "local->rebalance already set"); + } - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - cnt = i; - break; - } - } + if (stbuf) + memcpy(&local->rebalance.stbuf, stbuf, sizeof(struct iatt)); - if (cnt == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "got GF_EVENT_CHILD_UP bad subvolume %s", - subvol->name); - break; - } + if (prebuf) + memcpy(&local->rebalance.prebuf, prebuf, sizeof(struct iatt)); - LOCK (&conf->subvolume_lock); - { - conf->subvolume_status[cnt] = 1; - conf->last_event[cnt] = event; - } - UNLOCK (&conf->subvolume_lock); + if (postbuf) + memcpy(&local->rebalance.postbuf, postbuf, sizeof(struct iatt)); - /* one of the node came back up, do a stat update */ - dht_get_du_info_for_subvol (this, cnt); + if (xdata) + local->rebalance.xdata = dict_ref(xdata); - break; + local->rebalance.set = 1; - case GF_EVENT_CHILD_MODIFIED: - subvol = data; + return 0; +} - conf->gen++; +int32_t +dht_release(xlator_t *this, fd_t *fd) +{ + return dht_fd_ctx_destroy(this, fd); +} - break; +static int +dht_pt_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; - case GF_EVENT_CHILD_DOWN: - subvol = data; + local = frame->local; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - cnt = i; - break; - } - } + if (!op_ret) { + dht_layout_set(this, inode, local->layout); + } - if (cnt == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "got GF_EVENT_CHILD_DOWN bad subvolume %s", - subvol->name); - break; - } + 
DHT_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, stbuf, preparent, + postparent, NULL); - LOCK (&conf->subvolume_lock); - { - conf->subvolume_status[cnt] = 0; - conf->last_event[cnt] = event; - } - UNLOCK (&conf->subvolume_lock); + return 0; +} - break; +int32_t +dht_pt_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + bool free_xdata = false; + int ret = 0; + int op_errno = 0; + int32_t *disk_layout_p = NULL; + + conf = this->private; + + local = dht_local_init(frame, loc, NULL, GF_FOP_MKDIR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = dht_layout_new(this, conf->subvolume_cnt); + if (!layout) + goto wind; + + local->layout = layout; + + if (!xdata) { + xdata = dict_new(); + if (!xdata) + goto wind; + free_xdata = true; + } + + /*Set the xlator or the following will crash*/ + layout->list[0].xlator = conf->subvolumes[0]; + + dht_selfheal_layout_new_directory(frame, loc, layout); + + dht_disk_layout_extract(this, layout, 0, &disk_layout_p); + + ret = dict_set_bin(xdata, conf->xattr_name, disk_layout_p, 4 * 4); + if (ret) { + gf_msg("dht", GF_LOG_DEBUG, EINVAL, DHT_MSG_DICT_SET_FAILED, + "dht layout dict set failed"); + } +wind: + STACK_WIND(frame, dht_pt_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); + if (free_xdata) + dict_unref(xdata); + return 0; - case GF_EVENT_CHILD_CONNECTING: - subvol = data; +err: + op_errno = local ? 
local->op_errno : op_errno; + DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - cnt = i; - break; - } - } + return 0; +} - if (cnt == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "got GF_EVENT_CHILD_CONNECTING bad subvolume %s", - subvol->name); - break; - } +static int +dht_pt_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + dht_conf_t *conf = NULL; - LOCK (&conf->subvolume_lock); - { - conf->last_event[cnt] = event; - } - UNLOCK (&conf->subvolume_lock); + conf = this->private; + dict_del(xattr, conf->xattr_name); + dict_del(xattr, conf->mds_xattr_key); + dict_del(xattr, conf->commithash_xattr_name); - break; - default: - propagate = 1; - break; - } + if (frame->root->pid >= 0) { + GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr); + GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr); + } + DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata); + return 0; +} - /* have all subvolumes reported status once by now? */ - have_heard_from_all = 1; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (!conf->last_event[i]) - have_heard_from_all = 0; - } +int +dht_pt_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key, dict_t *xdata) +{ + STACK_WIND(frame, dht_pt_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, key, xdata); + return 0; +} + +static int +dht_pt_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + dht_conf_t *conf = NULL; - /* if all subvols have reported status, no need to hide anything - or wait for anything else. 
Just propagate blindly */ - if (have_heard_from_all) - propagate = 1; + conf = this->private; + dict_del(xattr, conf->xattr_name); - if (!had_heard_from_all && have_heard_from_all) { - /* This is the first event which completes aggregation - of events from all subvolumes. If at least one subvol - had come up, propagate CHILD_UP, but only this time - */ - event = GF_EVENT_CHILD_DOWN; + if (frame->root->pid >= 0) { + GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr); + GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr); + } - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->last_event[i] == GF_EVENT_CHILD_UP) { - event = GF_EVENT_CHILD_UP; - break; - } + DHT_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, xattr, xdata); + return 0; +} - if (conf->last_event[i] == GF_EVENT_CHILD_CONNECTING) { - event = GF_EVENT_CHILD_CONNECTING; - /* continue to check other events for CHILD_UP */ - } - } - } +int +dht_pt_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, + dict_t *xdata) +{ + STACK_WIND(frame, dht_pt_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, key, xdata); + return 0; +} + +/* The job of this function is to check if all the xlators have updated + * error in the layout. */ +int +dht_dir_layout_error_check(xlator_t *this, inode_t *inode) +{ + dht_layout_t *layout = NULL; + int i = 0; - if (propagate || event == GF_EVENT_CHILD_MODIFIED) - ret = default_notify (this, event, data); + layout = dht_layout_get(this, inode); + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == 0) { + return 0; + } + } - return ret; + /* Returning the first xlator error as all xlators have errors */ + return layout->list[0].err; } diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 9c39d0d63b8..fe0dc3db34a 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -1,307 +1,1384 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. 
<http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif +#include <regex.h> #include "dht-mem-types.h" +#include "dht-messages.h" +#include <glusterfs/call-stub.h> #include "libxlator.h" +#include <glusterfs/syncop.h> +#include <glusterfs/refcount.h> +#include <glusterfs/timer.h> +#include "protocol-common.h" +#include <glusterfs/glusterfs-acl.h> #ifndef _DHT_H #define _DHT_H -#define GF_XATTR_FIX_LAYOUT_KEY "trusted.distribute.fix.layout" -#define GF_DHT_LOOKUP_UNHASHED_ON 1 +#define GF_XATTR_FIX_LAYOUT_KEY "distribute.fix.layout" +#define GF_XATTR_FILE_MIGRATE_KEY "trusted.distribute.migrate-data" +#define DHT_MDS_STR "mds" +#define GF_DHT_LOOKUP_UNHASHED_OFF 0 +#define GF_DHT_LOOKUP_UNHASHED_ON 1 #define GF_DHT_LOOKUP_UNHASHED_AUTO 2 +#define DHT_PATHINFO_HEADER "DISTRIBUTE:" +#define DHT_FILE_MIGRATE_DOMAIN "dht.file.migrate" +/* Layout synchronization */ +#define DHT_LAYOUT_HEAL_DOMAIN "dht.layout.heal" +/* Namespace synchronization */ +#define DHT_ENTRY_SYNC_DOMAIN "dht.entry.sync" +#define DHT_LAYOUT_HASH_INVALID 1 +#define MAX_REBAL_THREADS sysconf(_SC_NPROCESSORS_ONLN) + +#define DHT_DIR_STAT_BLOCKS 8 +#define DHT_DIR_STAT_SIZE 4096 + +/* Virtual xattr for subvols status */ + +#define DHT_SUBVOL_STATUS_KEY "dht.subvol.status" + +/* Virtual xattrs for debugging */ -#include <fnmatch.h> +#define DHT_DBG_HASHED_SUBVOL_PATTERN "dht.file.hashed-subvol.*" +#define DHT_DBG_HASHED_SUBVOL_KEY "dht.file.hashed-subvol." 
-typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno); +/* Rebalance nodeuuid flags */ +#define REBAL_NODEUUID_MINE 0x01 +typedef int (*dht_selfheal_dir_cbk_t)(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata); +typedef int (*dht_defrag_cbk_fn_t)(xlator_t *this, xlator_t *dst_node, + call_frame_t *frame, int ret); + +typedef int (*dht_refresh_layout_unlock)(call_frame_t *frame, xlator_t *this, + int op_ret, int invoke_cbk); + +typedef int (*dht_refresh_layout_done_handle)(call_frame_t *frame); struct dht_layout { - int cnt; - int preset; - int gen; - int type; - int ref; /* use with dht_conf_t->layout_lock */ - int search_unhashed; - struct { - int err; /* 0 = normal - -1 = dir exists and no xattr - >0 = dir lookup failed with errno - */ - uint32_t start; - uint32_t stop; - xlator_t *xlator; - } list[0]; + int spread_cnt; /* layout spread count per directory, + is controlled by 'setxattr()' with + special key */ + int cnt; + int preset; + /* + * The last *configuration* state for which this directory was known + * to be in balance. The corresponding vol_commit_hash changes + * whenever bricks are added or removed. This value changes when a + * (full) rebalance is complete. If they match, it's safe to assume + * that every file is where it should be and there's no need to do + * lookups for files elsewhere. If they don't, then we have to do a + * global lookup to be sure. + */ + uint32_t commit_hash; + /* + * The *runtime* state of the volume, changes when connections to + * bricks are made or lost. 
+ */ + int gen; + int type; + gf_atomic_t ref; /* use with dht_conf_t->layout_lock */ + uint32_t search_unhashed; + struct { + int err; /* 0 = normal + -1 = dir exists and no xattr + >0 = dir lookup failed with errno + */ + uint32_t start; + uint32_t stop; + uint32_t commit_hash; + xlator_t *xlator; + } list[]; }; typedef struct dht_layout dht_layout_t; +struct dht_stat_time { + uint32_t atime; + uint32_t atime_nsec; + uint32_t ctime; + uint32_t ctime_nsec; + uint32_t mtime; + uint32_t mtime_nsec; +}; + +typedef struct dht_stat_time dht_stat_time_t; + +struct dht_inode_ctx { + dht_layout_t *layout; + dht_stat_time_t time; + xlator_t *lock_subvol; + xlator_t *mds_subvol; /* This is only used for directories */ +}; + +typedef struct dht_inode_ctx dht_inode_ctx_t; typedef enum { - DHT_HASH_TYPE_DM, + DHT_HASH_TYPE_DM, + DHT_HASH_TYPE_DM_USER, } dht_hashfn_type_t; +typedef enum { + DHT_INODELK, + DHT_ENTRYLK, +} dht_lock_type_t; + +/* rebalance related */ +struct dht_rebalance_ { + xlator_t *from_subvol; + xlator_t *target_node; + off_t offset; + size_t size; + int32_t flags; + int count; + struct iobref *iobref; + struct iovec *vector; + struct iatt stbuf; + struct iatt prebuf; + struct iatt postbuf; + dht_defrag_cbk_fn_t target_op_fn; + dict_t *xdata; + dict_t *xattr; + dict_t *dict; + struct gf_flock flock; + int32_t set; + int lock_cmd; +}; + +/** + * Enum to store decided action based on the qdstatfs (quota-deem-statfs) + * events + **/ +typedef enum { + qdstatfs_action_OFF = 0, + qdstatfs_action_REPLACE, + qdstatfs_action_NEGLECT, + qdstatfs_action_COMPARE, +} qdstatfs_action_t; + +typedef enum { + REACTION_INVALID, + FAIL_ON_ANY_ERROR, + IGNORE_ENOENT_ESTALE, + IGNORE_ENOENT_ESTALE_EIO, +} dht_reaction_type_t; + +struct dht_skip_linkto_unlink { + xlator_t *hash_links_to; + uuid_t cached_gfid; + uuid_t hashed_gfid; + int opend_fd_count; + gf_boolean_t handle_valid_link; +}; + +typedef struct { + xlator_t *xl; + loc_t loc; /* contains/points to inode to lock on. 
*/ + char *domain; /* Only locks within a single domain + * contend with each other + */ + char *basename; /* Required for entrylk */ + gf_boolean_t locked; + dht_reaction_type_t do_on_failure; + short type; /* read/write lock. */ + gf_lkowner_t lk_owner; +} dht_lock_t; + +/* The lock structure represents inodelk. */ +typedef struct { + fop_inodelk_cbk_t inodelk_cbk; + dht_lock_t **locks; + int lk_count; + dht_reaction_type_t reaction; + + /* whether locking failed on _any_ of the "locks" above */ + int op_ret; + int op_errno; +} dht_ilock_wrap_t; + +/* The lock structure represents entrylk. */ +typedef struct { + fop_entrylk_cbk_t entrylk_cbk; + dht_lock_t **locks; + int lk_count; + dht_reaction_type_t reaction; + + /* whether locking failed on _any_ of the "locks" above */ + int op_ret; + int op_errno; +} dht_elock_wrap_t; + +/* The first member of dht_dir_transaction_t should be of type dht_ilock_wrap_t. + * Otherwise it can result in subtle memory corruption issues as in most of the + * places we use lock[0].layout.my_layout or lock[0].layout.parent_layout and + * lock[0].ns.parent_layout (like in dht_local_wipe). 
+ */ +typedef union { + union { + dht_ilock_wrap_t my_layout; + dht_ilock_wrap_t parent_layout; + } layout; + struct dht_namespace { + dht_ilock_wrap_t parent_layout; + dht_elock_wrap_t directory_ns; + fop_entrylk_cbk_t ns_cbk; + } ns; +} dht_dir_transaction_t; + +typedef int (*dht_selfheal_layout_t)(call_frame_t *frame, loc_t *loc, + dht_layout_t *layout); + +typedef gf_boolean_t (*dht_need_heal_t)(call_frame_t *frame, + dht_layout_t **inmem, + dht_layout_t **ondisk); struct dht_local { - int call_cnt; - loc_t loc; - loc_t loc2; - int op_ret; - int op_errno; - int layout_mismatch; - /* Use stbuf as the postbuf, when we require both - * pre and post attrs */ - struct iatt stbuf; - struct iatt prebuf; - struct iatt preoldparent; - struct iatt postoldparent; - struct iatt preparent; - struct iatt postparent; - struct statvfs statvfs; - fd_t *fd; - inode_t *inode; - dict_t *params; - dict_t *xattr; - dict_t *xattr_req; - dht_layout_t *layout; - size_t size; - ino_t ia_ino; - xlator_t *src_hashed, *src_cached; - xlator_t *dst_hashed, *dst_cached; - xlator_t *cached_subvol; - xlator_t *hashed_subvol; - char need_selfheal; - int file_count; - int dir_count; - call_frame_t *main_frame; - struct { - fop_mknod_cbk_t linkfile_cbk; - struct iatt stbuf; - loc_t loc; - inode_t *inode; - dict_t *xattr; - xlator_t *srcvol; - } linkfile; - struct { - uint32_t hole_cnt; - uint32_t overlaps_cnt; - uint32_t missing; - uint32_t down; - uint32_t misc; - dht_selfheal_dir_cbk_t dir_cbk; - dht_layout_t *layout; - } selfheal; - uint32_t uid; - uint32_t gid; - - /* needed by nufa */ - int32_t flags; - mode_t mode; - dev_t rdev; - - /* need for file-info */ - char *pathinfo; - char *key; - - char *newpath; - - /* gfid related */ - uuid_t gfid; - - /*Marker Related*/ - struct marker_str marker; + loc_t loc; + loc_t loc2; + int call_cnt; + int op_ret; + int op_errno; + int layout_mismatch; + /* Use stbuf as the postbuf, when we require both + * pre and post attrs */ + struct iatt stbuf; + 
struct iatt mds_stbuf; + struct iatt prebuf; + struct iatt preoldparent; + struct iatt postoldparent; + struct iatt preparent; + struct iatt postparent; + struct statvfs statvfs; + fd_t *fd; + inode_t *inode; + dict_t *params; + dict_t *xattr; + dict_t *mds_xattr; + dict_t *xdata; /* dict used to save xdata response by xattr fop */ + dict_t *xattr_req; + dht_layout_t *layout; + size_t size; + ino_t ia_ino; + xlator_t *src_hashed, *src_cached; + xlator_t *dst_hashed, *dst_cached; + xlator_t *cached_subvol; + xlator_t *hashed_subvol; + xlator_t *mds_subvol; /* This is use for dir only */ + int file_count; + int dir_count; + call_frame_t *main_frame; + int fop_succeeded; + struct { + fop_mknod_cbk_t linkfile_cbk; + struct iatt stbuf; + loc_t loc; + inode_t *inode; + dict_t *xattr; + xlator_t *srcvol; + } linkfile; + struct { + uint32_t hole_cnt; + uint32_t overlaps_cnt; + uint32_t down; + uint32_t misc; + dht_selfheal_dir_cbk_t dir_cbk; + dht_selfheal_layout_t healer; + dht_need_heal_t should_heal; + dht_layout_t *layout, *refreshed_layout; + uint32_t missing_cnt; + gf_boolean_t force_mkdir; + } selfheal; + + dht_refresh_layout_unlock refresh_layout_unlock; + dht_refresh_layout_done_handle refresh_layout_done; + + uint32_t uid; + uint32_t gid; + pid_t pid; + + glusterfs_fop_t fop; + + /* need for file-info */ + char *xattr_val; + char *key; + + /* needed by nufa */ + int32_t flags; + mode_t mode; + dev_t rdev; + mode_t umask; + + /* which xattr request? 
*/ + char xsel[256]; + int32_t alloc_len; + + /* gfid related */ + uuid_t gfid; + uuid_t gfid_req; + + xlator_t *link_subvol; + + struct dht_rebalance_ rebalance; + xlator_t *first_up_subvol; + + struct dht_skip_linkto_unlink skip_unlink; + + dht_dir_transaction_t lock[2], *current; + + /* inodelks during filerename for backward compatibility */ + dht_lock_t **rename_inodelk_backward_compatible; + + call_stub_t *stub; + int32_t parent_disk_layout[4]; + + /* rename rollback */ + int *ret_cache; + + loc_t loc2_copy; + + int rename_inodelk_bc_count; + /* This is use only for directory operation */ + int32_t valid; + int32_t mds_heal_fresh_lookup; + short lock_type; + char need_selfheal; + char need_xattr_heal; + char need_attrheal; + /* flag used to make sure we need to return estale in + {lookup,revalidate}_cbk */ + char return_estale; + char need_lookup_everywhere; + /* fd open check */ + gf_boolean_t fd_checked; + gf_boolean_t linked; + gf_boolean_t added_link; + gf_boolean_t is_linkfile; + gf_boolean_t quota_deem_statfs; + gf_boolean_t heal_layout; + gf_boolean_t locked; + gf_boolean_t dont_create_linkto; + gf_boolean_t gfid_missing; }; typedef struct dht_local dht_local_t; /* du - disk-usage */ struct dht_du { - double avail_percent; - uint64_t avail_space; - uint32_t log; + double avail_percent; + double avail_inodes; + uint64_t avail_space; + uint32_t log; + uint32_t chunks; + uint32_t total_blocks; + uint32_t avail_blocks; + uint32_t frsize; /*fragment size*/ }; typedef struct dht_du dht_du_t; +enum gf_defrag_type { + GF_DEFRAG_CMD_NONE = 0, + GF_DEFRAG_CMD_START = 1, + GF_DEFRAG_CMD_STOP = 1 + 1, + GF_DEFRAG_CMD_STATUS = 1 + 2, + GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3, + GF_DEFRAG_CMD_START_FORCE = 1 + 4, + GF_DEFRAG_CMD_DETACH_STATUS = 1 + 11, + GF_DEFRAG_CMD_DETACH_START = 1 + 13, + GF_DEFRAG_CMD_DETACH_COMMIT = 1 + 14, + GF_DEFRAG_CMD_DETACH_COMMIT_FORCE = 1 + 15, + GF_DEFRAG_CMD_DETACH_STOP = 1 + 16, + /* new labels are used so it will help + * while 
removing old labels by easily differentiating. + * A few labels are added so that the count remains same + * between this enum and the ones on the xdr file. + * different values for the same enum cause errors and + * confusion. + */ +}; +typedef enum gf_defrag_type gf_defrag_type; + +enum gf_defrag_status_t { + GF_DEFRAG_STATUS_NOT_STARTED, + GF_DEFRAG_STATUS_STARTED, + GF_DEFRAG_STATUS_STOPPED, + GF_DEFRAG_STATUS_COMPLETE, + GF_DEFRAG_STATUS_FAILED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED, + GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE, + GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED, +}; +typedef enum gf_defrag_status_t gf_defrag_status_t; + +typedef struct gf_defrag_pattern_list gf_defrag_pattern_list_t; + +struct gf_defrag_pattern_list { + char path_pattern[256]; + uint64_t size; + gf_defrag_pattern_list_t *next; +}; + +struct dht_container { + union { + struct list_head list; + struct { + struct _gf_dirent_t *next; + struct _gf_dirent_t *prev; + }; + }; + gf_dirent_t *df_entry; + xlator_t *this; + loc_t *parent_loc; + dict_t *migrate_data; + int local_subvol_index; +}; + +typedef struct nodeuuid_info { + char info; /* Set to 1 is this is my node's uuid*/ + uuid_t uuid; /* Store the nodeuuid as well for debugging*/ +} nodeuuid_info_t; + +typedef struct subvol_nodeuuids_info { + nodeuuid_info_t *elements; + int count; +} subvol_nodeuuids_info_t; + +struct gf_defrag_info_ { + uint64_t total_files; + uint64_t total_data; + uint64_t num_files_lookedup; + uint64_t total_failures; + uint64_t skipped; + uint64_t num_dirs_processed; + uint64_t size_processed; + gf_lock_t lock; + pthread_t th; + struct rpc_clnt *rpc; + uint32_t connected; + uint32_t is_exiting; + pid_t pid; + int cmd; + inode_t *root_inode; + uuid_t node_uuid; + time_t start_time; + uint32_t new_commit_hash; + gf_defrag_status_t defrag_status; + gf_defrag_pattern_list_t *defrag_pattern; + + pthread_cond_t parallel_migration_cond; + pthread_mutex_t dfq_mutex; + pthread_cond_t 
rebalance_crawler_alarm; + int32_t q_entry_count; + int32_t global_error; + struct dht_container *queue; + int32_t crawl_done; + int32_t abort; + int32_t wakeup_crawler; + + /*Throttle params*/ + /*stands for reconfigured thread count*/ + int32_t recon_thread_count; + pthread_cond_t df_wakeup_thread; + + /* backpointer to make it easier to write functions for rebalance */ + xlator_t *this; + + pthread_cond_t fc_wakeup_cond; + pthread_mutex_t fc_mutex; + + /*stands for current running thread count*/ + int32_t current_thread_count; + + gf_boolean_t stats; + /* lock migration flag */ + gf_boolean_t lock_migration_enabled; +}; + +typedef struct gf_defrag_info_ gf_defrag_info_t; + +struct dht_methods_s { + int32_t (*migration_get_dst_subvol)(xlator_t *this, dht_local_t *local); + int32_t (*migration_other)(xlator_t *this, gf_defrag_info_t *defrag); + xlator_t *(*layout_search)(xlator_t *this, dht_layout_t *layout, + const char *name); +}; + +typedef struct dht_methods_s dht_methods_t; + struct dht_conf { - gf_lock_t subvolume_lock; - int subvolume_cnt; - xlator_t **subvolumes; - char *subvolume_status; - int *last_event; - dht_layout_t **file_layouts; - dht_layout_t **dir_layouts; - dht_layout_t *default_dir_layout; - gf_boolean_t search_unhashed; - int gen; - dht_du_t *du_stats; - uint64_t min_free_disk; - char disk_unit; - int32_t refresh_interval; - gf_boolean_t unhashed_sticky_bit; - struct timeval last_stat_fetch; - gf_lock_t layout_lock; - void *private; /* Can be used by wrapper xlators over - dht */ - gf_boolean_t use_readdirp; - char vol_uuid[UUID_SIZE + 1]; + xlator_t **subvolumes; + char *subvolume_status; + int *last_event; + dht_layout_t **file_layouts; + dht_layout_t **dir_layouts; + unsigned int search_unhashed; + int gen; + dht_du_t *du_stats; + double min_free_disk; + double min_free_inodes; + int subvolume_cnt; + int32_t refresh_interval; + gf_lock_t subvolume_lock; + time_t last_stat_fetch; + gf_lock_t layout_lock; + dict_t *leaf_to_subvol; + void 
*private; /* Can be used by wrapper xlators over + dht */ + time_t *subvol_up_time; + + /* to keep track of nodes which are decommissioned */ + xlator_t **decommissioned_bricks; + int decommission_in_progress; + int decommission_subvols_cnt; + + /* defrag related */ + gf_defrag_info_t *defrag; + + /* Support regex-based name reinterpretation. */ + regex_t rsync_regex; + regex_t extra_regex; + + /* Support variable xattr names. */ + char *xattr_name; + char *mds_xattr_key; + char *link_xattr_name; + char *commithash_xattr_name; + char *wild_xattr_name; + + dht_methods_t methods; + + struct mem_pool *lock_pool; + + /*local subvol storage for rebalance*/ + xlator_t **local_subvols; + subvol_nodeuuids_info_t *local_nodeuuids; + int32_t local_subvols_cnt; + + int dthrottle; + + /* Hard link handle requirement for migration triggered from client*/ + synclock_t link_lock; + + /* lock migration */ + gf_lock_t lock; + + /* This is the count used as the distribute layout for a directory */ + /* Will be a global flag to control the layout spread count */ + uint32_t dir_spread_cnt; + + /* + * "Commit hash" for this volume topology. Changed whenever bricks + * are added or removed. + */ + uint32_t vol_commit_hash; + + char vol_uuid[UUID_SIZE + 1]; + + char disk_unit; + + gf_boolean_t lock_migration_enabled; + + gf_boolean_t vch_forced; + + gf_boolean_t use_fallocate; + + gf_boolean_t force_migration; + + gf_boolean_t lookup_optimize; + + gf_boolean_t unhashed_sticky_bit; + + gf_boolean_t assert_no_child_down; + + gf_boolean_t use_readdirp; + + /* Request to filter directory entries in readdir request */ + gf_boolean_t readdir_optimize; + + gf_boolean_t rsync_regex_valid; + + gf_boolean_t extra_regex_valid; + + /* Support size-weighted rebalancing (heterogeneous bricks). 
*/ + gf_boolean_t do_weighting; + + gf_boolean_t randomize_by_gfid; }; typedef struct dht_conf dht_conf_t; +struct dht_dfoffset_ctx { + xlator_t *this; + off_t offset; + int32_t readdir_done; +}; +typedef struct dht_dfoffset_ctx dht_dfoffset_ctx_t; struct dht_disk_layout { - uint32_t cnt; - uint32_t type; - struct { - uint32_t start; - uint32_t stop; - } list[1]; + uint32_t cnt; + uint32_t type; + struct { + uint32_t start; + uint32_t stop; + } list[1]; }; typedef struct dht_disk_layout dht_disk_layout_t; -#define WIPE(statp) do { typeof(*statp) z = {0,}; if (statp) *statp = z; } while (0) +typedef enum { + GF_DHT_MIGRATE_DATA, + GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS, + GF_DHT_MIGRATE_HARDLINK, + GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS +} gf_dht_migrate_data_type_t; -#define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT) +typedef enum { + GF_DHT_EQUAL_DISTRIBUTION, + GF_DHT_WEIGHTED_DISTRIBUTION +} dht_distribution_type_t; -#define is_fs_root(loc) (strcmp (loc->path, "/") == 0) +struct dir_dfmeta { + gf_dirent_t *equeue; + dht_dfoffset_ctx_t *offset_var; + struct list_head **head; + struct list_head **iterator; + int *fetch_entries; + /* fds corresponding to local subvols only */ + fd_t **lfd; +}; + +typedef struct dht_migrate_info { + xlator_t *src_subvol; + xlator_t *dst_subvol; + GF_REF_DECL; +} dht_migrate_info_t; + +typedef struct dht_fd_ctx { + uint64_t opened_on_dst; + GF_REF_DECL; +} dht_fd_ctx_t; -#define is_revalidate(loc) (inode_ctx_get (loc->inode, this, NULL) == 0) +#define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT) + +#define is_revalidate(loc) \ + (dht_inode_ctx_layout_get((loc)->inode, this, NULL) == 0) #define is_last_call(cnt) (cnt == 0) -#define DHT_LINKFILE_MODE (S_ISVTX) -#define check_is_linkfile(i,s,x) ( \ - ((st_mode_from_ia (s->ia_prot, s->ia_type) & ~S_IFMT) \ - == DHT_LINKFILE_MODE) && \ - (s->ia_size == 0)) +#define DHT_MIGRATION_IN_PROGRESS 1 +#define DHT_MIGRATION_COMPLETED 2 + +#define 
check_is_linkfile(i, s, x, n) \ + (IS_DHT_LINKFILE_MODE(s) && dict_get(x, n)) -#define check_is_dir(i,s,x) (IA_ISDIR(s->ia_type)) +#define IS_DHT_MIGRATION_PHASE2(buf) \ + (IA_ISREG((buf)->ia_type) && \ + ((st_mode_from_ia((buf)->ia_prot, (buf)->ia_type) & ~S_IFMT) == \ + DHT_LINKFILE_MODE)) + +#define IS_DHT_MIGRATION_PHASE1(buf) \ + (IA_ISREG((buf)->ia_type) && ((buf)->ia_prot.sticky == 1) && \ + ((buf)->ia_prot.sgid == 1)) + +#define DHT_STRIP_PHASE1_FLAGS(buf) \ + do { \ + if ((buf) && IS_DHT_MIGRATION_PHASE1(buf)) { \ + (buf)->ia_prot.sticky = 0; \ + (buf)->ia_prot.sgid = 0; \ + } \ + } while (0) + +#define dht_inode_missing(op_errno) (op_errno == ENOENT || op_errno == ESTALE) + +#define check_is_dir(i, s, x) (IA_ISDIR(s->ia_type)) #define layout_is_sane(layout) ((layout) && (layout->cnt > 0)) -#define DHT_STACK_UNWIND(fop, frame, params ...) do { \ - dht_local_t *__local = NULL; \ - xlator_t *__xl = NULL; \ - if (frame) { \ - __xl = frame->this; \ - __local = frame->local; \ - frame->local = NULL; \ - } \ - STACK_UNWIND_STRICT (fop, frame, params); \ - dht_local_wipe (__xl, __local); \ - } while (0) - -#define DHT_STACK_DESTROY(frame) do { \ - dht_local_t *__local = NULL; \ - xlator_t *__xl = NULL; \ - __xl = frame->this; \ - __local = frame->local; \ - frame->local = NULL; \ - STACK_DESTROY (frame->root); \ - dht_local_wipe (__xl, __local); \ - } while (0) - -dht_layout_t *dht_layout_new (xlator_t *this, int cnt); -dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode); -dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol); -xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout, - const char *name); -int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout); -int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, - uint32_t *holes_p, uint32_t *overlaps_p, - uint32_t *missing_p, uint32_t *down_p, - uint32_t *misc_p); -int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, 
- xlator_t *subvol, loc_t *loc, dict_t *xattr); - -xlator_t *dht_linkfile_subvol (xlator_t *this, inode_t *inode, - struct iatt *buf, dict_t *xattr); -int dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, - xlator_t *subvol, loc_t *loc); - -int dht_layouts_init (xlator_t *this, dht_conf_t *conf); -int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, - int op_ret, int op_errno, dict_t *xattr); - -int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, - int pos, int32_t **disk_layout_p); -int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, - int pos, void *disk_layout_raw); - - -int dht_frame_return (call_frame_t *frame); - -int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y); -int dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol, - uint64_t *x); - -void dht_local_wipe (xlator_t *this, dht_local_t *local); -dht_local_t *dht_local_init (call_frame_t *frame); -int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt *from, - xlator_t *subvol); - -xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc); -xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode); -xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev); -int dht_subvol_cnt (xlator_t *this, xlator_t *subvol); - -int dht_hash_compute (int type, const char *name, uint32_t *hash_p); - -int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, - xlator_t *tovol, xlator_t *fromvol, loc_t *loc); -int dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc); -int dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc); -int -dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, - loc_t *loc, dht_layout_t *layout); -int -dht_selfheal_new_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, - dht_layout_t *layout); -int -dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, - loc_t *loc, dht_layout_t 
*layout); -int -dht_layout_sort_volname (dht_layout_t *layout); - -int dht_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); - -int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc); - -int dht_is_subvol_filled (xlator_t *this, xlator_t *subvol); -xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol); -int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx); - -int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode); -int dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout); -void dht_layout_unref (xlator_t *this, dht_layout_t *layout); -dht_layout_t *dht_layout_ref (xlator_t *this, dht_layout_t *layout); -xlator_t *dht_first_up_subvol (xlator_t *this); -xlator_t *dht_last_up_subvol (xlator_t *this); - -int dht_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name); - -int dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, - xlator_t **subvol); +#define we_are_not_migrating(x) ((x) == 1) + +#define DHT_STACK_UNWIND(fop, frame, params...) 
\ + do { \ + dht_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + if (frame) { \ + __xl = frame->this; \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + dht_local_wipe(__xl, __local); \ + } while (0) + +#define DHT_STACK_DESTROY(frame) \ + do { \ + dht_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + __xl = frame->this; \ + __local = frame->local; \ + frame->local = NULL; \ + STACK_DESTROY(frame->root); \ + dht_local_wipe(__xl, __local); \ + } while (0) + +#define DHT_UPDATE_TIME(ctx_sec, ctx_nsec, new_sec, new_nsec, post) \ + do { \ + if (ctx_sec == new_sec) \ + new_nsec = max(new_nsec, ctx_nsec); \ + else if (ctx_sec > new_sec) { \ + new_sec = ctx_sec; \ + new_nsec = ctx_nsec; \ + } \ + if (post) { \ + ctx_sec = new_sec; \ + ctx_nsec = new_nsec; \ + } \ + } while (0) + +#define is_greater_time(a, an, b, bn) \ + (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))) + +#define DHT_MARK_FOP_INTERNAL(xattr) \ + do { \ + int tmp = -1; \ + if (!xattr) { \ + xattr = dict_new(); \ + if (!xattr) \ + break; \ + } \ + tmp = dict_set_str(xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \ + if (tmp) { \ + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, \ + "Failed to set dictionary value: key = %s," \ + " path = %s", \ + GLUSTERFS_INTERNAL_FOP_KEY, local->loc.path); \ + } \ + } while (0) + +dht_layout_t * +dht_layout_new(xlator_t *this, int cnt); +dht_layout_t * +dht_layout_get(xlator_t *this, inode_t *inode); +dht_layout_t * +dht_layout_for_subvol(xlator_t *this, xlator_t *subvol); +xlator_t * +dht_layout_search(xlator_t *this, dht_layout_t *layout, const char *name); +int32_t +dht_migration_get_dst_subvol(xlator_t *this, dht_local_t *local); +int32_t +dht_migration_needed(xlator_t *this); +int +dht_layout_normalize(xlator_t *this, loc_t *loc, dht_layout_t *layout); +void +dht_layout_anomalies(xlator_t *this, loc_t *loc, dht_layout_t *layout, + uint32_t *holes_p, uint32_t *overlaps_p, + uint32_t 
*missing_p, uint32_t *down_p, uint32_t *misc_p, + uint32_t *no_space_p); +int +dht_layout_dir_mismatch(xlator_t *this, dht_layout_t *layout, xlator_t *subvol, + loc_t *loc, dict_t *xattr); +xlator_t * +dht_linkfile_subvol(xlator_t *this, inode_t *inode, struct iatt *buf, + dict_t *xattr); +int +dht_linkfile_unlink(call_frame_t *frame, xlator_t *this, xlator_t *subvol, + loc_t *loc); + +int +dht_layouts_init(xlator_t *this, dht_conf_t *conf); +int +dht_layout_merge(xlator_t *this, dht_layout_t *layout, xlator_t *subvol, + int op_ret, int op_errno, dict_t *xattr); + +int +dht_disk_layout_extract(xlator_t *this, dht_layout_t *layout, int pos, + int32_t **disk_layout_p); +int +dht_disk_layout_extract_for_subvol(xlator_t *this, dht_layout_t *layout, + xlator_t *subvol, int32_t **disk_layout_p); + +int +dht_frame_return(call_frame_t *frame); + +int +dht_deitransform(xlator_t *this, uint64_t y, xlator_t **subvol); + +void +dht_local_wipe(xlator_t *this, dht_local_t *local); +dht_local_t * +dht_local_init(call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop); +int +dht_iatt_merge(xlator_t *this, struct iatt *to, struct iatt *from); + +xlator_t * +dht_subvol_get_hashed(xlator_t *this, loc_t *loc); +xlator_t * +dht_subvol_get_cached(xlator_t *this, inode_t *inode); +xlator_t * +dht_subvol_next(xlator_t *this, xlator_t *prev); +xlator_t * +dht_subvol_next_available(xlator_t *this, xlator_t *prev); +int +dht_subvol_cnt(xlator_t *this, xlator_t *subvol); + +int +dht_hash_compute(xlator_t *this, int type, const char *name, uint32_t *hash_p); + +int +dht_linkfile_create(call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, + xlator_t *this, xlator_t *tovol, xlator_t *fromvol, + loc_t *loc); +int +dht_lookup_everywhere(call_frame_t *frame, xlator_t *this, loc_t *loc); +int +dht_selfheal_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, + loc_t *loc, dht_layout_t *layout); +int +dht_selfheal_new_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, + 
dht_layout_t *layout); +int +dht_selfheal_restore(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, + loc_t *loc, dht_layout_t *layout); +void +dht_layout_sort_volname(dht_layout_t *layout); + +int +dht_get_du_info(call_frame_t *frame, xlator_t *this, loc_t *loc); + +gf_boolean_t +dht_is_subvol_filled(xlator_t *this, xlator_t *subvol); +xlator_t * +dht_free_disk_available_subvol(xlator_t *this, xlator_t *subvol, + dht_local_t *layout); +int +dht_get_du_info_for_subvol(xlator_t *this, int subvol_idx); + +int +dht_layout_preset(xlator_t *this, xlator_t *subvol, inode_t *inode); +int +dht_layout_set(xlator_t *this, inode_t *inode, dht_layout_t *layout); +; +void +dht_layout_unref(xlator_t *this, dht_layout_t *layout); +dht_layout_t * +dht_layout_ref(xlator_t *this, dht_layout_t *layout); +int +dht_layout_index_for_subvol(dht_layout_t *layout, xlator_t *subvol); +xlator_t * +dht_first_up_subvol(xlator_t *this); +xlator_t * +dht_last_up_subvol(xlator_t *this); + +int +dht_build_child_loc(xlator_t *this, loc_t *child, loc_t *parent, char *name); + +int +dht_filter_loc_subvol_key(xlator_t *this, loc_t *loc, loc_t *new_loc, + xlator_t **subvol); + +int +dht_rename_cleanup(call_frame_t *frame); +int +dht_rename_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); + +int +dht_update_commit_hash_for_layout(call_frame_t *frame); +int +dht_fix_directory_layout(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + dht_layout_t *layout); + +int +dht_init_subvolumes(xlator_t *this, dht_conf_t *conf); + +/* migration/rebalance */ +int +dht_start_rebalance_task(xlator_t *this, call_frame_t *frame); + +int +dht_rebalance_in_progress_check(xlator_t *this, call_frame_t *frame); +int +dht_rebalance_complete_check(xlator_t *this, call_frame_t *frame); + +int +dht_init_local_subvolumes(xlator_t *this, dht_conf_t *conf); + +/* FOPS */ +int32_t 
+dht_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req); + +int32_t +dht_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int32_t +dht_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); + +int32_t +dht_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata); + +int32_t +dht_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata); + +int32_t +dht_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata); + +int32_t +dht_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata); + +int32_t +dht_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata); + +int32_t +dht_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata); + +int32_t +dht_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata); + +int32_t +dht_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata); + +int32_t +dht_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata); + +int32_t +dht_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); + +int32_t +dht_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); + +int32_t +dht_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *params); + +int32_t +dht_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata); + +int32_t +dht_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); + +int32_t +dht_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref 
*iobref, + dict_t *xdata); + +int32_t +dht_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); + +int32_t +dht_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata); + +int32_t +dht_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata); + +int32_t +dht_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata); + +int32_t +dht_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int32_t +dht_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata); + +int32_t +dht_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata); + +int32_t +dht_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata); + +int32_t +dht_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata); + +int32_t +dht_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata); +int32_t +dht_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata); + +int32_t +dht_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata); + +int32_t +dht_lease(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct gf_lease *lease, dict_t *xdata); + +int32_t +dht_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + int32_t cmd, struct gf_flock *flock, dict_t *xdata); + +int32_t +dht_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata); + +int32_t +dht_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata); + +int32_t +dht_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const 
char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata); + +int32_t +dht_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata); + +int32_t +dht_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict); + +int32_t +dht_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata); + +int32_t +dht_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata); + +int32_t +dht_forget(xlator_t *this, inode_t *inode); +int32_t +dht_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata); +int32_t +dht_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata); +int32_t +dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata); +int32_t +dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); +int32_t +dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata); +int32_t +dht_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata); + +int +dht_set_subvol_range(xlator_t *this); +int32_t +dht_init(xlator_t *this); +void +dht_fini(xlator_t *this); +int +dht_reconfigure(xlator_t *this, dict_t *options); +int32_t +dht_notify(xlator_t *this, int32_t event, void *data, ...); + +/* definitions for nufa/switch */ +int +dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent); +int +dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent); +int +dht_lookup_linkfile_cbk(call_frame_t *frame, void 
*cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent); +int +dht_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent); +int +dht_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata); +int +dht_newfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata); + +int +dht_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +dht_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xattr, dict_t *xdata); + +int +dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata); +int +gf_defrag_status_get(dht_conf_t *conf, dict_t *dict); + +int +gf_defrag_stop(dht_conf_t *conf, gf_defrag_status_t status, dict_t *output); + +void * +gf_defrag_start(void *this); + +int32_t +gf_defrag_handle_hardlink(xlator_t *this, loc_t *loc, int *fop_errno); +int +dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, + int flag, int *fop_errno); +int +dht_inode_ctx_layout_get(inode_t *inode, xlator_t *this, + dht_layout_t **layout_int); +int +dht_inode_ctx_layout_set(inode_t *inode, xlator_t *this, + dht_layout_t *layout_int); +int +dht_inode_ctx_time_update(inode_t *inode, xlator_t *this, struct iatt *stat, + int32_t update_ctx); +void +dht_inode_ctx_time_set(inode_t *inode, xlator_t *this, struct iatt *stat); + +int +dht_inode_ctx_get(inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx); +int 
+dht_inode_ctx_set(inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx); +int +dht_dir_attr_heal(void *data); +int +dht_dir_attr_heal_done(int ret, call_frame_t *sync_frame, void *data); +xlator_t * +dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, + xlator_t *ignore, dht_layout_t *layout, + uint64_t filesize); +xlator_t * +dht_subvol_maxspace_nonzeroinode(xlator_t *this, xlator_t *subvol, + dht_layout_t *layout); +int +dht_dir_has_layout(dict_t *xattr, char *name); +int +dht_linkfile_attr_heal(call_frame_t *frame, xlator_t *this); + +int32_t +dht_priv_dump(xlator_t *this); +int32_t +dht_inodectx_dump(xlator_t *this, inode_t *inode); + +gf_boolean_t +dht_is_subvol_in_layout(dht_layout_t *layout, xlator_t *xlator); + +int +dht_inode_ctx_get_mig_info(xlator_t *this, inode_t *inode, + xlator_t **src_subvol, xlator_t **dst_subvol); +gf_boolean_t +dht_mig_info_is_invalid(xlator_t *current, xlator_t *src_subvol, + xlator_t *dst_subvol); + +int +dht_subvol_status(dht_conf_t *conf, xlator_t *subvol); + +void +dht_log_new_layout_for_dir_selfheal(xlator_t *this, loc_t *loc, + dht_layout_t *layout); +int +dht_layout_sort(dht_layout_t *layout); + +int +dht_heal_full_path(void *data); + +int +dht_heal_full_path_done(int op_ret, call_frame_t *frame, void *data); + +int +dht_layout_missing_dirs(dht_layout_t *layout); + +int +dht_refresh_layout(call_frame_t *frame); + +int +dht_build_parent_loc(xlator_t *this, loc_t *parent, loc_t *child, + int32_t *op_errno); + +int32_t +dht_set_local_rebalance(xlator_t *this, dht_local_t *local, struct iatt *stbuf, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); +void +dht_build_root_loc(inode_t *inode, loc_t *loc); + +gf_boolean_t +dht_fd_open_on_dst(xlator_t *this, fd_t *fd, xlator_t *dst); + +int32_t +dht_fd_ctx_destroy(xlator_t *this, fd_t *fd); + +int32_t +dht_release(xlator_t *this, fd_t *fd); + +int32_t +dht_set_fixed_dir_stat(struct iatt *stat); + +xlator_t * +dht_get_lock_subvolume(xlator_t *this, 
struct gf_flock *lock, + dht_local_t *local); + +int +dht_lk_inode_unref(call_frame_t *frame, int32_t op_ret); + +int +dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *subvol); + +int +dht_check_and_open_fd_on_subvol(xlator_t *this, call_frame_t *frame); + +/* FD fop callbacks */ + +int +dht_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); + +int +dht_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xdata); + +int +dht_file_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata); + +int +dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); + +int +dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); + +int +dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); + +int +dht_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); + +int +dht_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); + +int +dht_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iovec *vector, int count, struct iatt *stbuf, + struct iobref *iobref, dict_t *xdata); + +int +dht_file_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *stbuf, dict_t *xdata); + +int +dht_file_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int 
op_errno, dict_t *xdata); + +int +dht_file_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata); + +/* All custom xattr heal functions */ +int +dht_dir_heal_xattrs(void *data); + +int +dht_dir_heal_xattrs_done(int ret, call_frame_t *sync_frame, void *data); + +int32_t +dht_dict_set_array(dict_t *dict, char *key, int32_t value[], int32_t size); + +int +dht_set_user_xattr(dict_t *dict, char *k, data_t *v, void *data); + +void +dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst, + dict_t *src, int *uret, int *uflag); + +int +dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno); + +int +dht_common_mark_mdsxattr(call_frame_t *frame, int *errst, int flag); + +int +dht_inode_ctx_mdsvol_get(inode_t *inode, xlator_t *this, xlator_t **mdsvol); + +int +dht_selfheal_dir_setattr(call_frame_t *frame, loc_t *loc, struct iatt *stbuf, + int32_t valid, dht_layout_t *layout); + +/* Abstract out the DHT-IATT-IN-DICT */ + +void +dht_selfheal_layout_new_directory(call_frame_t *frame, loc_t *loc, + dht_layout_t *new_layout); + +int +dht_pt_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, + dict_t *xdata); + +int +dht_pt_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key, dict_t *xdata); + +int32_t +dht_pt_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata); + +int +dht_pt_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); + +int32_t +dht_check_remote_fd_failed_error(dht_local_t *local, int op_ret, int op_errno); + +int +dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata); + +int32_t +dht_create_lock(call_frame_t *frame, xlator_t *subvol); + +int +dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local); + +int +dht_dir_layout_error_check(xlator_t *this, 
inode_t *inode); + +int +dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol); #endif /* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 75953781ef7..c0588828fdb 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -1,258 +1,487 @@ /* - Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - /* TODO: add NS locking */ -#include "glusterfs.h" -#include "xlator.h" #include "dht-common.h" -#include "defaults.h" #include <sys/time.h> - +#include <glusterfs/events.h> int -dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct statvfs *statvfs) +dht_du_info_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *statvfs, dict_t *xdata) { - dht_conf_t *conf = NULL; - call_frame_t *prev = NULL; - int this_call_cnt = 0; - int i = 0; - double percent = 0; - uint64_t bytes = 0; - - conf = this->private; - prev = cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to get disk info from %s", prev->this->name); - goto out; - } - - if (statvfs && statvfs->f_blocks) { - percent = (statvfs->f_bfree * 100) / statvfs->f_blocks; - bytes = (statvfs->f_bfree * statvfs->f_frsize); - } - - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) - if (prev->this == conf->subvolumes[i]) { - conf->du_stats[i].avail_percent = percent; - conf->du_stats[i].avail_space = bytes; - gf_log (this->name, GF_LOG_DEBUG, - "on subvolume '%s': avail_percent is: " - "%.2f and avail_space is: %"PRIu64"", - prev->this->name, - conf->du_stats[i].avail_percent, - conf->du_stats[i].avail_space); - } - } - UNLOCK (&conf->subvolume_lock); + dht_conf_t *conf = NULL; + xlator_t *prev = NULL; + int this_call_cnt = 0; + int i = 0; + double percent = 0; + double percent_inodes = 0; + uint64_t bytes = 0; + uint32_t bpc; /* blocks per chunk */ + uint32_t chunks = 0; + + conf = this->private; + prev = cookie; + + if (op_ret == -1 || !statvfs) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, + DHT_MSG_GET_DISK_INFO_ERROR, "failed to get disk info from %s", + prev->name); + goto out; + } + + if (statvfs->f_blocks) { + percent = (statvfs->f_bavail * 100) / statvfs->f_blocks; + bytes = 
(statvfs->f_bavail * statvfs->f_frsize); + /* + * A 32-bit count of 1MB chunks allows a maximum brick size of + * ~4PB. It's possible that we could see a single local FS + * bigger than that some day, but this code is likely to be + * irrelevant by then. Meanwhile, it's more important to keep + * the chunk size small so the layout-calculation code that + * uses this value can be tested on normal machines. + */ + bpc = (1 << 20) / statvfs->f_bsize; + chunks = (statvfs->f_blocks + bpc - 1) / bpc; + } + + if (statvfs->f_files) { + percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files; + } else { + /* + * Set percent inodes to 100 for dynamically allocated inode + * filesystems. The rationale is that distribute need not + * worry about total inodes; rather, let the 'create()' be + * scheduled on the hashed subvol regardless of the total + * inodes. + */ + percent_inodes = 100; + } + + LOCK(&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) + if (prev == conf->subvolumes[i]) { + conf->du_stats[i].avail_percent = percent; + conf->du_stats[i].avail_space = bytes; + conf->du_stats[i].avail_inodes = percent_inodes; + conf->du_stats[i].chunks = chunks; + conf->du_stats[i].total_blocks = statvfs->f_blocks; + conf->du_stats[i].avail_blocks = statvfs->f_bavail; + conf->du_stats[i].frsize = statvfs->f_frsize; + + gf_msg_debug(this->name, 0, + "subvolume '%s': avail_percent " + "is: %.2f and avail_space " + "is: %" PRIu64 + " and avail_inodes" + " is: %.2f", + prev->name, conf->du_stats[i].avail_percent, + conf->du_stats[i].avail_space, + conf->du_stats[i].avail_inodes); + break; /* no point in looping further */ + } + } + UNLOCK(&conf->subvolume_lock); out: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_DESTROY (frame); + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) + DHT_STACK_DESTROY(frame); - return 0; + return 0; } int -dht_get_du_info_for_subvol (xlator_t *this, int 
subvol_idx) +dht_get_du_info_for_subvol(xlator_t *this, int subvol_idx) { - dht_conf_t *conf = NULL; - call_frame_t *statfs_frame = NULL; - dht_local_t *statfs_local = NULL; - call_pool_t *pool = NULL; + dht_conf_t *conf = NULL; + call_frame_t *statfs_frame = NULL; + dht_local_t *statfs_local = NULL; + call_pool_t *pool = NULL; + loc_t tmp_loc = { + 0, + }; + + conf = this->private; + pool = this->ctx->pool; + + statfs_frame = create_frame(this, pool); + if (!statfs_frame) { + goto err; + } + + /* local->fop value is not used in this case */ + statfs_local = dht_local_init(statfs_frame, NULL, NULL, GF_FOP_MAXVALUE); + if (!statfs_local) { + goto err; + } + + /* make it root gfid, should be enough to get the proper info back */ + tmp_loc.gfid[15] = 1; + + statfs_local->call_cnt = 1; + STACK_WIND_COOKIE( + statfs_frame, dht_du_info_cbk, conf->subvolumes[subvol_idx], + conf->subvolumes[subvol_idx], + conf->subvolumes[subvol_idx]->fops->statfs, &tmp_loc, NULL); + + return 0; +err: + if (statfs_frame) + DHT_STACK_DESTROY(statfs_frame); - conf = this->private; - pool = this->ctx->pool; + return -1; +} - statfs_frame = create_frame (this, pool); +int +dht_get_du_info(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + int i = 0; + int ret = -1; + dht_conf_t *conf = NULL; + call_frame_t *statfs_frame = NULL; + dht_local_t *statfs_local = NULL; + loc_t tmp_loc = { + 0, + }; + time_t now; + + conf = this->private; + now = gf_time(); + /* make it root gfid, should be enough to get the proper + info back */ + tmp_loc.gfid[15] = 1; + + if (now > (conf->refresh_interval + conf->last_stat_fetch)) { + statfs_frame = copy_frame(frame); if (!statfs_frame) { - goto err; + goto err; } - statfs_local = dht_local_init (statfs_frame); + /* In this case, 'local->fop' is not used */ + statfs_local = dht_local_init(statfs_frame, loc, NULL, GF_FOP_MAXVALUE); if (!statfs_local) { - goto err; + goto err; } - loc_t tmp_loc = { .inode = NULL, - .path = "/", - }; + statfs_local->params = 
dict_new(); + if (!statfs_local->params) + goto err; + + ret = dict_set_int8(statfs_local->params, + GF_INTERNAL_IGNORE_DEEM_STATFS, 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set " GF_INTERNAL_IGNORE_DEEM_STATFS " in dict"); + goto err; + } - statfs_local->call_cnt = 1; - STACK_WIND (statfs_frame, dht_du_info_cbk, - conf->subvolumes[subvol_idx], - conf->subvolumes[subvol_idx]->fops->statfs, - &tmp_loc); + statfs_local->call_cnt = conf->subvolume_cnt; + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND_COOKIE(statfs_frame, dht_du_info_cbk, + conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->statfs, &tmp_loc, + statfs_local->params); + } - return 0; + conf->last_stat_fetch = now; + } + return 0; err: - if (statfs_frame) - DHT_STACK_DESTROY (statfs_frame); + if (statfs_frame) + DHT_STACK_DESTROY(statfs_frame); - return -1; + return -1; } -int -dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc) +gf_boolean_t +dht_is_subvol_filled(xlator_t *this, xlator_t *subvol) { - int i = 0; - dht_conf_t *conf = NULL; - call_frame_t *statfs_frame = NULL; - dht_local_t *statfs_local = NULL; - struct timeval tv = {0,}; - - conf = this->private; - - gettimeofday (&tv, NULL); - if (tv.tv_sec > (conf->refresh_interval - + conf->last_stat_fetch.tv_sec)) { - - statfs_frame = copy_frame (frame); - if (!statfs_frame) { - goto err; + int i = 0; + char vol_name[256]; + dht_conf_t *conf = NULL; + gf_boolean_t subvol_filled_inodes = _gf_false; + gf_boolean_t subvol_filled_space = _gf_false; + gf_boolean_t is_subvol_filled = _gf_false; + double usage = 0; + + conf = this->private; + + /* Check for values above specified percent or free disk */ + LOCK(&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + if (conf->disk_unit == 'p') { + if (conf->du_stats[i].avail_percent < conf->min_free_disk) { + subvol_filled_space = _gf_true; + break; + } + + 
} else { + if (conf->du_stats[i].avail_space < conf->min_free_disk) { + subvol_filled_space = _gf_true; + break; + } } - - statfs_local = dht_local_init (statfs_frame); - if (!statfs_local) { - goto err; + if (conf->du_stats[i].avail_inodes < conf->min_free_inodes) { + subvol_filled_inodes = _gf_true; + break; } + } + } + } + UNLOCK(&conf->subvolume_lock); - loc_copy (&statfs_local->loc, loc); - loc_t tmp_loc = { .inode = NULL, - .path = "/", - }; - - statfs_local->call_cnt = conf->subvolume_cnt; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (statfs_frame, dht_du_info_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->statfs, - &tmp_loc); - } + if (subvol_filled_space && conf->subvolume_status[i]) { + if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { + usage = 100 - conf->du_stats[i].avail_percent; - conf->last_stat_fetch.tv_sec = tv.tv_sec; + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_INSUFF_SPACE, + "disk space on subvolume '%s' is getting " + "full (%.2f %%), consider adding more bricks", + subvol->name, usage); + + (void)snprintf(vol_name, sizeof(vol_name), "%s", this->name); + vol_name[(strlen(this->name) - 4)] = '\0'; + + gf_event(EVENT_DHT_DISK_USAGE, "volume=%s;subvol=%s;usage=%.2f %%", + vol_name, subvol->name, usage); } - return 0; -err: - if (statfs_frame) - DHT_STACK_DESTROY (statfs_frame); + } + + if (subvol_filled_inodes && conf->subvolume_status[i]) { + if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { + usage = 100 - conf->du_stats[i].avail_inodes; + gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_SUBVOL_INSUFF_INODES, + "inodes on subvolume '%s' are at " + "(%.2f %%), consider adding more bricks", + subvol->name, usage); + + (void)snprintf(vol_name, sizeof(vol_name), "%s", this->name); + vol_name[(strlen(this->name) - 4)] = '\0'; + + gf_event(EVENT_DHT_INODES_USAGE, + "volume=%s;subvol=%s;usage=%.2f %%", vol_name, + subvol->name, usage); + } + } - return -1; + is_subvol_filled = 
(subvol_filled_space || subvol_filled_inodes); + + return is_subvol_filled; } +/*Get the best subvolume to create the file in*/ +xlator_t * +dht_free_disk_available_subvol(xlator_t *this, xlator_t *subvol, + dht_local_t *local) +{ + xlator_t *avail_subvol = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + loc_t *loc = NULL; + + conf = this->private; + if (!local) + goto out; + loc = &local->loc; + if (!local->layout) { + layout = dht_layout_get(this, loc->parent); + + if (!layout) { + gf_msg_debug(this->name, 0, + "Missing layout. path=%s," + " parent gfid = %s", + loc->path, uuid_utoa(loc->parent->gfid)); + goto out; + } + } else { + layout = dht_layout_ref(this, local->layout); + } + + LOCK(&conf->subvolume_lock); + { + avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, NULL, + layout, 0); + if (!avail_subvol) { + avail_subvol = dht_subvol_maxspace_nonzeroinode(this, subvol, + layout); + } + } + UNLOCK(&conf->subvolume_lock); +out: + if (!avail_subvol) { + gf_msg_debug(this->name, 0, + "No subvolume has enough free space \ + and/or inodes to create"); + avail_subvol = subvol; + } + + if (layout) + dht_layout_unref(this, layout); + return avail_subvol; +} -int -dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) +static inline int32_t +dht_subvol_has_err(dht_conf_t *conf, xlator_t *this, xlator_t *ignore, + dht_layout_t *layout) { - int i = 0; - int subvol_filled = 0; - dht_conf_t *conf = NULL; - - conf = this->private; - - /* Check for values above specified percent or free disk */ - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - if (conf->disk_unit == 'p') { - if (conf->du_stats[i].avail_percent < - conf->min_free_disk) { - subvol_filled = 1; - break; - } - } else { - if (conf->du_stats[i].avail_space < - conf->min_free_disk) { - subvol_filled = 1; - break; - } - } - } - } + int ret = -1; + int i = 0; + + if (!this || !layout) + goto out; + + /* this check 
is meant for rebalance process. The source of the file + * should be ignored for space check */ + if (this == ignore) { + goto out; + } + + /* check if subvol has layout errors, before selecting it */ + for (i = 0; i < layout->cnt; i++) { + if (!strcmp(layout->list[i].xlator->name, this->name) && + (layout->list[i].err != 0)) { + ret = -1; + goto out; } - UNLOCK (&conf->subvolume_lock); - - if (subvol_filled && conf->subvolume_status[i]) { - if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { - gf_log (this->name, GF_LOG_WARNING, - "disk space on subvolume '%s' is getting " - "full (%.2f %%), consider adding more nodes", - subvol->name, - (100 - conf->du_stats[i].avail_percent)); - } + } + + /* discard decommissioned subvol */ + if (conf->decommission_subvols_cnt) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i] && + conf->decommissioned_bricks[i] == this) { + ret = -1; + goto out; + } } + } - return subvol_filled; + ret = 0; +out: + return ret; } +/*Get subvolume which has both space and inodes more than the min criteria*/ xlator_t * -dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol) +dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, + xlator_t *ignore, dht_layout_t *layout, + uint64_t filesize) { - int i = 0; - double max= 0; - xlator_t *avail_subvol = NULL; - dht_conf_t *conf = NULL; - - conf = this->private; - - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->disk_unit == 'p') { - if (conf->du_stats[i].avail_percent > max) { - max = conf->du_stats[i].avail_percent; - avail_subvol = conf->subvolumes[i]; - } - } else { - if (conf->du_stats[i].avail_space > max) { - max = conf->du_stats[i].avail_space; - avail_subvol = conf->subvolumes[i]; - } - } - } + int i = 0; + double max = 0; + double max_inodes = 0; + int ignore_subvol = 0; + uint64_t total_blocks = 0; + uint64_t avail_blocks = 0; + uint64_t frsize = 0; + double post_availspace = 0; + 
double post_percent = 0; + + xlator_t *avail_subvol = NULL; + dht_conf_t *conf = NULL; + + conf = this->private; + + for (i = 0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors and also it is not a + * decommissioned brick, before selecting it */ + ignore_subvol = dht_subvol_has_err(conf, conf->subvolumes[i], ignore, + layout); + if (ignore_subvol) + continue; + + if ((conf->disk_unit == 'p') && + (conf->du_stats[i].avail_percent > conf->min_free_disk) && + (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { + if ((conf->du_stats[i].avail_inodes > max_inodes) || + (conf->du_stats[i].avail_percent > max)) { + max = conf->du_stats[i].avail_percent; + max_inodes = conf->du_stats[i].avail_inodes; + avail_subvol = conf->subvolumes[i]; + total_blocks = conf->du_stats[i].total_blocks; + avail_blocks = conf->du_stats[i].avail_blocks; + frsize = conf->du_stats[i].frsize; + } } - UNLOCK (&conf->subvolume_lock); - if (!avail_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume has enough free space to create"); + if ((conf->disk_unit != 'p') && + (conf->du_stats[i].avail_space > conf->min_free_disk) && + (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { + if ((conf->du_stats[i].avail_inodes > max_inodes) || + (conf->du_stats[i].avail_space > max)) { + max = conf->du_stats[i].avail_space; + max_inodes = conf->du_stats[i].avail_inodes; + avail_subvol = conf->subvolumes[i]; + } } + } + + if (avail_subvol) { + if (conf->disk_unit == 'p') { + post_availspace = (avail_blocks * frsize) - filesize; + post_percent = (post_availspace * 100) / (total_blocks * frsize); + if (post_percent < conf->min_free_disk) + avail_subvol = NULL; + } + if (conf->disk_unit != 'p') { + if ((max - filesize) < conf->min_free_disk) + avail_subvol = NULL; + } + } - if (max < conf->min_free_disk) - avail_subvol = subvol; + return avail_subvol; +} - if (!avail_subvol) - avail_subvol = subvol; +/* Get subvol which has at least one inode and maximum space */ 
+xlator_t * +dht_subvol_maxspace_nonzeroinode(xlator_t *this, xlator_t *subvol, + dht_layout_t *layout) +{ + int i = 0; + double max = 0; + int ignore_subvol = 0; + + xlator_t *avail_subvol = NULL; + dht_conf_t *conf = NULL; + + conf = this->private; + + for (i = 0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors and also it is not a + * decommissioned brick, before selecting it*/ + + ignore_subvol = dht_subvol_has_err(conf, conf->subvolumes[i], NULL, + layout); + if (ignore_subvol) + continue; + + if (conf->disk_unit == 'p') { + if ((conf->du_stats[i].avail_percent > max) && + (conf->du_stats[i].avail_inodes > 0)) { + max = conf->du_stats[i].avail_percent; + avail_subvol = conf->subvolumes[i]; + } + } else { + if ((conf->du_stats[i].avail_space > max) && + (conf->du_stats[i].avail_inodes > 0)) { + max = conf->du_stats[i].avail_space; + avail_subvol = conf->subvolumes[i]; + } + } + } - return avail_subvol; + return avail_subvol; } diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c index 99bb13265b7..acda67c312a 100644 --- a/xlators/cluster/dht/src/dht-hashfn.c +++ b/xlators/cluster/dht/src/dht-hashfn.c @@ -1,81 +1,110 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. 
If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - - -#include "glusterfs.h" -#include "xlator.h" #include "dht-common.h" -#include "hashfn.h" - +#include <glusterfs/hashfn.h> -int -dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p) +static int +dht_hash_compute_internal(int type, const char *name, const int len, + uint32_t *hash_p) { - int ret = 0; - uint32_t hash = 0; + int ret = 0; + uint32_t hash = 0; - switch (type) { + switch (type) { case DHT_HASH_TYPE_DM: - hash = gf_dm_hashfn (name, strlen (name)); - break; + case DHT_HASH_TYPE_DM_USER: + hash = gf_dm_hashfn(name, len); + break; default: - ret = -1; - break; - } + ret = -1; + break; + } - if (ret == 0) { - *hash_p = hash; - } + if (ret == 0) { + *hash_p = hash; + } - return ret; + return ret; } +/* The function returns: + * 0 : in case no munge took place + * >0 : the length (inc. terminating NULL!) of the newly modified string, + * if it was munged. + */ +static int +dht_munge_name(const char *original, char *modified, size_t len, regex_t *re) +{ + regmatch_t matches[2] = { + {0}, + }; + size_t new_len = 0; + int ret = 0; + + ret = regexec(re, original, 2, matches, 0); + + if (ret != REG_NOMATCH) { + if (matches[1].rm_so != -1) { + new_len = matches[1].rm_eo - matches[1].rm_so; + /* Equal would fail due to the NUL at the end. 
*/ + if (new_len < len) { + memcpy(modified, original + matches[1].rm_so, new_len); + modified[new_len] = '\0'; + return new_len + 1; /* +1 for the terminating NULL */ + } + } + } -#define MAKE_RSYNC_FRIENDLY_NAME(rsync_frndly_name, name) do { \ - rsync_frndly_name = (char *) name; \ - if (name[0] == '.') { \ - char *dot = 0; \ - int namelen = 0; \ - \ - dot = strrchr (name, '.'); \ - if (dot && dot > (name + 1) && *(dot + 1)) { \ - namelen = (dot - name); \ - rsync_frndly_name = alloca (namelen); \ - strncpy (rsync_frndly_name, name + 1, \ - namelen); \ - rsync_frndly_name[namelen - 1] = 0; \ - } \ - } \ - } while (0); - + /* This is guaranteed safe because of how the dest was allocated. */ + strcpy(modified, original); + return 0; +} int -dht_hash_compute (int type, const char *name, uint32_t *hash_p) +dht_hash_compute(xlator_t *this, int type, const char *name, uint32_t *hash_p) { - char *rsync_friendly_name = NULL; + char *rsync_friendly_name = NULL; + dht_conf_t *priv = NULL; + size_t len = 0; + int munged = 0; + + priv = this->private; - MAKE_RSYNC_FRIENDLY_NAME (rsync_friendly_name, name); + if (name == NULL) + return -1; - return dht_hash_compute_internal (type, rsync_friendly_name, hash_p); + len = strlen(name) + 1; + rsync_friendly_name = alloca(len); + + LOCK(&priv->lock); + { + if (priv->extra_regex_valid) { + munged = dht_munge_name(name, rsync_friendly_name, len, + &priv->extra_regex); + } + + if (!munged && priv->rsync_regex_valid) { + gf_msg_trace(this->name, 0, "trying regex for %s", name); + munged = dht_munge_name(name, rsync_friendly_name, len, + &priv->rsync_regex); + } + } + UNLOCK(&priv->lock); + if (munged) { + gf_msg_debug(this->name, 0, "munged down to %s", rsync_friendly_name); + len = munged; + } else { + rsync_friendly_name = (char *)name; + } + + return dht_hash_compute_internal(type, rsync_friendly_name, len - 1, + hash_p); } diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index 
cd57b9ea083..3f2fe43d5f3 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -1,493 +1,2304 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. +#include "dht-common.h" +#include "dht-lock.h" +#include "glusterfs/compat-errno.h" // for ENODATA on BSD - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ +static void +dht_free_fd_ctx(dht_fd_ctx_t *fd_ctx) +{ + GF_FREE(fd_ctx); +} -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif +int32_t +dht_fd_ctx_destroy(xlator_t *this, fd_t *fd) +{ + dht_fd_ctx_t *fd_ctx = NULL; + uint64_t value = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + ret = fd_ctx_del(fd, this, &value); + if (ret) { + goto out; + } + + fd_ctx = (dht_fd_ctx_t *)(uintptr_t)value; + if (fd_ctx) { + GF_REF_PUT(fd_ctx); + } +out: + return ret; +} +static int +__dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *dst) +{ + dht_fd_ctx_t *fd_ctx = NULL; + uint64_t value = 0; + int ret = -1; -#include "glusterfs.h" -#include "xlator.h" -#include "dht-common.h" + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + fd_ctx = GF_CALLOC(1, sizeof(*fd_ctx), gf_dht_mt_fd_ctx_t); + if (!fd_ctx) { + goto out; + } + + fd_ctx->opened_on_dst = (uint64_t)(uintptr_t)dst; + GF_REF_INIT(fd_ctx, dht_free_fd_ctx); + + value = (uint64_t)(uintptr_t)fd_ctx; + + ret = __fd_ctx_set(fd, this, value); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FD_CTX_SET_FAILED, + "fd=0x%p", fd, NULL); + GF_REF_PUT(fd_ctx); + } +out: + return ret; +} int -dht_frame_return (call_frame_t *frame) +dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *dst) { - dht_local_t *local = NULL; - int this_call_cnt = -1; + dht_fd_ctx_t *fd_ctx = NULL; + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + LOCK(&fd->lock); + { + ret = __fd_ctx_get(fd, this, &value); + if (ret && value) { + fd_ctx = (dht_fd_ctx_t *)(uintptr_t)value; + if (fd_ctx->opened_on_dst == (uint64_t)(uintptr_t)dst) { + /* This could happen due to racing + * check_progress tasks*/ + goto unlock; + } else { + /* This would be a big problem*/ + /* Overwrite and hope for the best*/ + fd_ctx->opened_on_dst = (uint64_t)(uintptr_t)dst; + 
UNLOCK(&fd->lock); + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_VALUE, + NULL); - if (!frame) - return -1; + goto out; + } + } + ret = __dht_fd_ctx_set(this, fd, dst); + } +unlock: + UNLOCK(&fd->lock); +out: + return ret; +} - local = frame->local; +static dht_fd_ctx_t * +dht_fd_ctx_get(xlator_t *this, fd_t *fd) +{ + dht_fd_ctx_t *fd_ctx = NULL; + int ret = -1; + uint64_t tmp_val = 0; + + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + LOCK(&fd->lock); + { + ret = __fd_ctx_get(fd, this, &tmp_val); + if ((ret < 0) || (tmp_val == 0)) { + goto unlock; + } - LOCK (&frame->lock); - { - this_call_cnt = --local->call_cnt; + fd_ctx = (dht_fd_ctx_t *)(uintptr_t)tmp_val; + GF_REF_GET(fd_ctx); + } +unlock: + UNLOCK(&fd->lock); + +out: + return fd_ctx; +} + +gf_boolean_t +dht_fd_open_on_dst(xlator_t *this, fd_t *fd, xlator_t *dst) +{ + dht_fd_ctx_t *fd_ctx = NULL; + gf_boolean_t opened = _gf_false; + + fd_ctx = dht_fd_ctx_get(this, fd); + + if (fd_ctx) { + if (fd_ctx->opened_on_dst == (uint64_t)(uintptr_t)dst) { + opened = _gf_true; } - UNLOCK (&frame->lock); + GF_REF_PUT(fd_ctx); + } - return this_call_cnt; + return opened; } +void +dht_free_mig_info(void *data) +{ + dht_migrate_info_t *miginfo = NULL; -int -dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) + miginfo = data; + GF_FREE(miginfo); + + return; +} + +static int +dht_inode_ctx_set_mig_info(xlator_t *this, inode_t *inode, xlator_t *src_subvol, + xlator_t *dst_subvol) { - dht_conf_t *conf = NULL; - int cnt = 0; - int max = 0; - uint64_t y = 0; + dht_migrate_info_t *miginfo = NULL; + uint64_t value = 0; + int ret = -1; - if (x == ((uint64_t) -1)) { - y = (uint64_t) -1; - goto out; + miginfo = GF_CALLOC(1, sizeof(*miginfo), gf_dht_mt_miginfo_t); + if (miginfo == NULL) + goto out; + + miginfo->src_subvol = src_subvol; + miginfo->dst_subvol = dst_subvol; + GF_REF_INIT(miginfo, dht_free_mig_info); + + value = (uint64_t)(uintptr_t)miginfo; + + 
ret = inode_ctx_set1(inode, this, &value); + if (ret < 0) { + GF_REF_PUT(miginfo); + } + +out: + return ret; +} + +int +dht_inode_ctx_get_mig_info(xlator_t *this, inode_t *inode, + xlator_t **src_subvol, xlator_t **dst_subvol) +{ + int ret = -1; + uint64_t tmp_miginfo = 0; + dht_migrate_info_t *miginfo = NULL; + + LOCK(&inode->lock); + { + ret = __inode_ctx_get1(inode, this, &tmp_miginfo); + if ((ret < 0) || (tmp_miginfo == 0)) { + UNLOCK(&inode->lock); + goto out; } - conf = this->private; - if (!conf) - goto out; + miginfo = (dht_migrate_info_t *)(uintptr_t)tmp_miginfo; + GF_REF_GET(miginfo); + } + UNLOCK(&inode->lock); - max = conf->subvolume_cnt; - cnt = dht_subvol_cnt (this, subvol); + if (src_subvol) + *src_subvol = miginfo->src_subvol; - y = ((x * max) + cnt); + if (dst_subvol) + *dst_subvol = miginfo->dst_subvol; + + GF_REF_PUT(miginfo); out: - if (y_p) - *y_p = y; + return ret; +} - return 0; +gf_boolean_t +dht_mig_info_is_invalid(xlator_t *current, xlator_t *src_subvol, + xlator_t *dst_subvol) +{ + /* Not set + */ + if (!src_subvol || !dst_subvol) + return _gf_true; + + /* Invalid scenarios: + * The src_subvol does not match the subvol on which the current op was sent + * so the cached subvol has changed between the last mig_info_set and now. + * src_subvol == dst_subvol. The file was migrated without any FOP detecting + * a P2 so the old dst is now the current subvol. + * + * There is still one scenario where the info could be outdated - if + * file has undergone multiple migrations and ends up on the same src_subvol + * on which the mig_info was first set. + */ + if ((current == dst_subvol) || (current != src_subvol)) + return _gf_true; + + return _gf_false; } +/* Used to check if fd fops have the fd opened on the cached subvol + * This is required when: + * 1. an fd is opened on FILE1 on subvol1 + * 2. the file is migrated to subvol2 + * 3. a lookup updates the cached subvol in the inode_ctx to subvol2 + * 4. 
a write comes on the fd + * The write is sent to subvol2 on an fd which has been opened only on fd1 + * Since the migration phase checks don't kick in, the fop fails with EBADF + * + */ + int -dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, - xlator_t **subvol) +dht_check_and_open_fd_on_subvol_complete(int ret, call_frame_t *frame, + void *data) { - char *new_name = NULL; - char *new_path = NULL; - xlator_list_t *trav = NULL; - char key[1024] = {0,}; - int ret = 0; /* not found */ + glusterfs_fop_t fop = 0; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + fd_t *fd = NULL; + int op_errno = -1; + + local = frame->local; + this = frame->this; + fop = local->fop; + subvol = local->cached_subvol; + fd = local->fd; + + if (ret) { + op_errno = local->op_errno; + goto handle_err; + } + + switch (fop) { + case GF_FOP_WRITE: + STACK_WIND_COOKIE(frame, dht_writev_cbk, subvol, subvol, + subvol->fops->writev, fd, local->rebalance.vector, + local->rebalance.count, local->rebalance.offset, + local->rebalance.flags, local->rebalance.iobref, + local->xattr_req); + break; + + case GF_FOP_FLUSH: + STACK_WIND(frame, dht_flush_cbk, subvol, subvol->fops->flush, fd, + local->xattr_req); + break; + + case GF_FOP_FSETATTR: + STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol, + subvol->fops->fsetattr, fd, + &local->rebalance.stbuf, local->rebalance.flags, + local->xattr_req); + break; + + case GF_FOP_ZEROFILL: + STACK_WIND_COOKIE(frame, dht_zerofill_cbk, subvol, subvol, + subvol->fops->zerofill, fd, + local->rebalance.offset, local->rebalance.size, + local->xattr_req); + + break; + + case GF_FOP_DISCARD: + STACK_WIND_COOKIE(frame, dht_discard_cbk, subvol, subvol, + subvol->fops->discard, local->fd, + local->rebalance.offset, local->rebalance.size, + local->xattr_req); + break; + + case GF_FOP_FALLOCATE: + STACK_WIND_COOKIE(frame, dht_fallocate_cbk, subvol, subvol, + subvol->fops->fallocate, fd, + local->rebalance.flags, 
local->rebalance.offset, + local->rebalance.size, local->xattr_req); + break; + + case GF_FOP_FTRUNCATE: + STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol, + subvol->fops->ftruncate, fd, + local->rebalance.offset, local->xattr_req); + break; + + case GF_FOP_FSYNC: + STACK_WIND_COOKIE(frame, dht_fsync_cbk, subvol, subvol, + subvol->fops->fsync, local->fd, + local->rebalance.flags, local->xattr_req); + break; + + case GF_FOP_READ: + STACK_WIND(frame, dht_readv_cbk, subvol, subvol->fops->readv, + local->fd, local->rebalance.size, + local->rebalance.offset, local->rebalance.flags, + local->xattr_req); + break; + + case GF_FOP_FSTAT: + STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol, + subvol->fops->fstat, fd, local->xattr_req); + break; + + case GF_FOP_FSETXATTR: + STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol, + subvol->fops->fsetxattr, local->fd, + local->rebalance.xattr, local->rebalance.flags, + local->xattr_req); + break; + + case GF_FOP_FREMOVEXATTR: + STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol, + subvol->fops->fremovexattr, local->fd, local->key, + local->xattr_req); + + break; + + case GF_FOP_FXATTROP: + STACK_WIND(frame, dht_common_xattrop_cbk, subvol, + subvol->fops->fxattrop, local->fd, + local->rebalance.flags, local->rebalance.xattr, + local->xattr_req); + break; + + case GF_FOP_FGETXATTR: + STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->fgetxattr, + local->fd, local->key, NULL); + break; + + case GF_FOP_FINODELK: + STACK_WIND(frame, dht_finodelk_cbk, subvol, subvol->fops->finodelk, + local->key, local->fd, local->rebalance.lock_cmd, + &local->rebalance.flock, local->xattr_req); + break; + default: + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UNKNOWN_FOP, "fd=%p", + fd, "gfid=%s", uuid_utoa(fd->inode->gfid), "name=%s", + subvol->name, NULL); + break; + } + + goto out; + + /* Could not open the fd on the dst. 
Unwind */ + +handle_err: + + switch (fop) { + case GF_FOP_WRITE: + DHT_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_FLUSH: + DHT_STACK_UNWIND(flush, frame, -1, op_errno, NULL); + break; + + case GF_FOP_FSETATTR: + DHT_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_ZEROFILL: + DHT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_DISCARD: + DHT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_FALLOCATE: + DHT_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_FTRUNCATE: + DHT_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_FSYNC: + DHT_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_READ: + DHT_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, + NULL); + break; + + case GF_FOP_FSTAT: + DHT_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL); + break; + + case GF_FOP_FSETXATTR: + DHT_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL); + break; + + case GF_FOP_FREMOVEXATTR: + DHT_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL); + break; + + case GF_FOP_FXATTROP: + DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL); + break; + + case GF_FOP_FGETXATTR: + DHT_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL); + break; + + case GF_FOP_FINODELK: + DHT_STACK_UNWIND(finodelk, frame, -1, op_errno, NULL); + break; + + default: + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UNKNOWN_FOP, "fd=%p", + fd, "gfid=%s", uuid_utoa(fd->inode->gfid), "name=%s", + subvol->name, NULL); + break; + } - /* Why do other tasks if first required 'char' itself is not there */ - if (loc->name && !strchr (loc->name, '@')) - goto out; +out: - trav = this->children; - while (trav) { - snprintf (key, 1024, "*@%s:%s", this->name, trav->xlator->name); - if (fnmatch (key, loc->name, FNM_NOESCAPE) == 0) 
{ - new_name = GF_CALLOC(strlen (loc->name), - sizeof (char), - gf_common_mt_char); - if (!new_name) - goto out; - if (fnmatch (key, loc->path, FNM_NOESCAPE) == 0) { - new_path = GF_CALLOC(strlen (loc->path), - sizeof (char), - gf_common_mt_char); - if (!new_path) - goto out; - strncpy (new_path, loc->path, (strlen (loc->path) - - strlen (key) + 1)); - } - strncpy (new_name, loc->name, (strlen (loc->name) - - strlen (key) + 1)); - - if (new_loc) { - new_loc->path = ((new_path) ? new_path: - gf_strdup (loc->path)); - new_loc->name = new_name; - new_loc->ino = loc->ino; - new_loc->inode = inode_ref (loc->inode); - new_loc->parent = inode_ref (loc->parent); - } - *subvol = trav->xlator; - ret = 1; /* success */ - goto out; - } - trav = trav->next; + return 0; +} + +/* Check once again if the fd has been opened on the cached subvol. + * If not, open and update the fd_ctx. + */ + +int +dht_check_and_open_fd_on_subvol_task(void *data) +{ + loc_t loc = { + 0, + }; + int ret = -1; + call_frame_t *frame = NULL; + dht_local_t *local = NULL; + fd_t *fd = NULL; + xlator_t *this = NULL; + xlator_t *subvol = NULL; + + frame = data; + local = frame->local; + this = THIS; + fd = local->fd; + subvol = local->cached_subvol; + + local->fd_checked = _gf_true; + + if (fd_is_anonymous(fd) || dht_fd_open_on_dst(this, fd, subvol)) { + ret = 0; + goto out; + } + + gf_msg_debug(this->name, 0, "Opening fd (%p, flags=0%o) on file %s @ %s", + fd, fd->flags, uuid_utoa(fd->inode->gfid), subvol->name); + + loc.inode = inode_ref(fd->inode); + gf_uuid_copy(loc.gfid, fd->inode->gfid); + + /* Open this on the dst subvol */ + + SYNCTASK_SETID(0, 0); + + ret = syncop_open(subvol, &loc, (fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)), + fd, NULL, NULL); + + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_OPEN_FD_ON_DST_FAILED, + "fd=%p", fd, "flags=0%o", fd->flags, "gfid=%s", + uuid_utoa(fd->inode->gfid), "name=%s", subvol->name, NULL); + /* This can happen if the cached subvol was updated 
in the + * inode_ctx and the fd was opened on the new cached suvol + * after this fop was wound on the old cached subvol. + * As we do not close the fd on the old subvol (a leak) + * don't treat ENOENT as an error and allow the phase1/phase2 + * checks to handle it. + */ + + if ((-ret != ENOENT) && (-ret != ESTALE)) { + local->op_errno = -ret; + ret = -1; + } else { + ret = 0; } + + local->op_errno = -ret; + ret = -1; + + } else { + dht_fd_ctx_set(this, fd, subvol); + } + + SYNCTASK_SETID(frame->root->uid, frame->root->gid); out: - if (!ret) { - /* !success */ - if (new_path) - GF_FREE (new_path); - if (new_name) - GF_FREE (new_name); - } - return ret; + loc_wipe(&loc); + + return ret; } int -dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, - uint64_t *x_p) +dht_check_and_open_fd_on_subvol(xlator_t *this, call_frame_t *frame) { - dht_conf_t *conf = NULL; - int cnt = 0; - int max = 0; - uint64_t x = 0; - xlator_t *subvol = 0; + int ret = -1; + dht_local_t *local = NULL; + + /* + if (dht_fd_open_on_dst (this, fd, subvol)) + goto out; + */ + local = frame->local; + + ret = synctask_new(this->ctx->env, dht_check_and_open_fd_on_subvol_task, + dht_check_and_open_fd_on_subvol_complete, frame, frame); + + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SYNCTASK_CREATE_FAILED, + "to-check-and-open fd=%p", local->fd, NULL); + } + + return ret; +} + +int +dht_frame_return(call_frame_t *frame) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + + if (!frame) + return -1; - if (!this->private) + local = frame->local; + + LOCK(&frame->lock); + { + this_call_cnt = --local->call_cnt; + } + UNLOCK(&frame->lock); + + return this_call_cnt; +} + +/* + * Use this function to specify which subvol you want the file created + * on - this need not be the hashed subvol. 
+ * Format: <filename>@<this->name>:<subvol-name> + * Eg: file-1@vol1-dht:vol1-client-0 + * where vol1 is a pure distribute volume + * will create file-1 on vol1-client-0 + */ + +int +dht_filter_loc_subvol_key(xlator_t *this, loc_t *loc, loc_t *new_loc, + xlator_t **subvol) +{ + char *new_name = NULL; + char *new_path = NULL; + xlator_list_t *trav = NULL; + char key[1024] = { + 0, + }; + int ret = 0; /* not found */ + int keylen = 0; + int name_len = 0; + int path_len = 0; + + /* Why do other tasks if first required 'char' itself is not there */ + if (!new_loc || !loc || !loc->name || !strchr(loc->name, '@')) { + /* Skip the GF_FREE checks here */ + return ret; + } + + trav = this->children; + while (trav) { + keylen = snprintf(key, sizeof(key), "*@%s:%s", this->name, + trav->xlator->name); + /* Ignore '*' */ + keylen = keylen - 1; + if (fnmatch(key, loc->name, FNM_NOESCAPE) == 0) { + name_len = strlen(loc->name) - keylen; + new_name = GF_MALLOC(name_len + 1, gf_common_mt_char); + if (!new_name) goto out; + if (fnmatch(key, loc->path, FNM_NOESCAPE) == 0) { + path_len = strlen(loc->path) - keylen; + new_path = GF_MALLOC(path_len + 1, gf_common_mt_char); + if (!new_path) + goto out; + snprintf(new_path, path_len + 1, "%s", loc->path); + } + snprintf(new_name, name_len + 1, "%s", loc->name); + + if (new_loc) { + new_loc->path = ((new_path) ? 
new_path : gf_strdup(loc->path)); + new_loc->name = new_name; + new_loc->inode = inode_ref(loc->inode); + new_loc->parent = inode_ref(loc->parent); + } + *subvol = trav->xlator; + ret = 1; /* success */ + goto out; + } + trav = trav->next; + } +out: + if (!ret) { + /* !success */ + GF_FREE(new_path); + GF_FREE(new_name); + } + return ret; +} - conf = this->private; - max = conf->subvolume_cnt; +static xlator_t * +dht_get_subvol_from_id(xlator_t *this, int client_id) +{ + xlator_t *xl = NULL; + dht_conf_t *conf = NULL; + char *sid = NULL; + int32_t ret = -1; - cnt = y % max; - x = y / max; + conf = this->private; - subvol = conf->subvolumes[cnt]; + ret = gf_asprintf(&sid, "%d", client_id); + if (ret == -1) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_ASPRINTF_FAILED, NULL); + goto out; + } - if (subvol_p) - *subvol_p = subvol; + if (dict_get_ptr(conf->leaf_to_subvol, sid, (void **)&xl)) + xl = NULL; - if (x_p) - *x_p = x; + GF_FREE(sid); out: - return 0; + return xl; } +int +dht_deitransform(xlator_t *this, uint64_t y, xlator_t **subvol_p) +{ + int client_id = 0; + xlator_t *subvol = 0; + dht_conf_t *conf = NULL; + + if (!this->private) + return -1; + + conf = this->private; + + client_id = gf_deitransform(this, y); + + subvol = dht_get_subvol_from_id(this, client_id); + + if (!subvol) + subvol = conf->subvolumes[0]; + + if (subvol_p) + *subvol_p = subvol; + + return 0; +} void -dht_local_wipe (xlator_t *this, dht_local_t *local) +dht_local_wipe(xlator_t *this, dht_local_t *local) { - if (!local) - return; + int i = 0; - loc_wipe (&local->loc); - loc_wipe (&local->loc2); + if (!local) + return; - if (local->xattr) - dict_unref (local->xattr); + loc_wipe(&local->loc); + loc_wipe(&local->loc2); + loc_wipe(&local->loc2_copy); - if (local->inode) - inode_unref (local->inode); + if (local->xattr) + dict_unref(local->xattr); - if (local->layout) { - dht_layout_unref (this, local->layout); - local->layout = NULL; - } + if (local->inode) + inode_unref(local->inode); - 
loc_wipe (&local->linkfile.loc); + if (local->layout) { + dht_layout_unref(this, local->layout); + local->layout = NULL; + } - if (local->linkfile.xattr) - dict_unref (local->linkfile.xattr); + loc_wipe(&local->linkfile.loc); - if (local->linkfile.inode) - inode_unref (local->linkfile.inode); + if (local->linkfile.xattr) + dict_unref(local->linkfile.xattr); - if (local->fd) { - fd_unref (local->fd); - local->fd = NULL; - } + if (local->linkfile.inode) + inode_unref(local->linkfile.inode); - if (local->params) { - dict_unref (local->params); - local->params = NULL; - } + if (local->fd) { + fd_unref(local->fd); + local->fd = NULL; + } - if (local->xattr_req) - dict_unref (local->xattr_req); + if (local->params) { + dict_unref(local->params); + local->params = NULL; + } - if (local->selfheal.layout) { - dht_layout_unref (this, local->selfheal.layout); - local->selfheal.layout = NULL; - } + if (local->xattr_req) + dict_unref(local->xattr_req); + if (local->mds_xattr) + dict_unref(local->mds_xattr); + if (local->xdata) + dict_unref(local->xdata); - if (local->newpath) { - GF_FREE (local->newpath); - } + if (local->selfheal.layout) { + dht_layout_unref(this, local->selfheal.layout); + local->selfheal.layout = NULL; + } - if (local->key) { - GF_FREE (local->key); - } + if (local->selfheal.refreshed_layout) { + dht_layout_unref(this, local->selfheal.refreshed_layout); + local->selfheal.refreshed_layout = NULL; + } - GF_FREE (local); -} + for (i = 0; i < 2; i++) { + dht_lock_array_free(local->lock[i].ns.parent_layout.locks, + local->lock[i].ns.parent_layout.lk_count); + GF_FREE(local->lock[i].ns.parent_layout.locks); -dht_local_t * -dht_local_init (call_frame_t *frame) -{ - dht_local_t *local = NULL; + dht_lock_array_free(local->lock[i].ns.directory_ns.locks, + local->lock[i].ns.directory_ns.lk_count); + GF_FREE(local->lock[i].ns.directory_ns.locks); + } - /* TODO: use mem-pool */ - local = GF_CALLOC (1, sizeof (*local), - gf_dht_mt_dht_local_t); + GF_FREE(local->key); - if 
(!local) - return NULL; + if (local->rebalance.xdata) + dict_unref(local->rebalance.xdata); - local->op_ret = -1; - local->op_errno = EUCLEAN; + if (local->rebalance.xattr) + dict_unref(local->rebalance.xattr); - frame->local = local; + if (local->rebalance.dict) + dict_unref(local->rebalance.dict); - return local; -} + GF_FREE(local->rebalance.vector); + if (local->rebalance.iobref) + iobref_unref(local->rebalance.iobref); -char * -basestr (const char *str) -{ - char *basestr = NULL; + if (local->stub) { + call_stub_destroy(local->stub); + local->stub = NULL; + } - basestr = strrchr (str, '/'); - if (basestr) - basestr ++; + if (local->ret_cache) + GF_FREE(local->ret_cache); - return basestr; + mem_put(local); } +dht_local_t * +dht_local_init(call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop) +{ + dht_local_t *local = NULL; + inode_t *inode = NULL; + int ret = 0; + + local = mem_get0(THIS->local_pool); + if (!local) + goto out; + + if (loc) { + ret = loc_copy(&local->loc, loc); + if (ret) + goto out; + + inode = loc->inode; + } + + if (fd) { + local->fd = fd_ref(fd); + if (!inode) + inode = fd->inode; + } + + local->op_ret = -1; + local->op_errno = EUCLEAN; + local->fop = fop; + + if (inode) { + local->layout = dht_layout_get(frame->this, inode); + local->cached_subvol = dht_subvol_get_cached(frame->this, inode); + } + + frame->local = local; + +out: + if (ret) { + if (local) + mem_put(local); + local = NULL; + } + return local; +} xlator_t * -dht_first_up_subvol (xlator_t *this) +dht_first_up_subvol(xlator_t *this) { - dht_conf_t *conf = NULL; - xlator_t *child = NULL; - int i = 0; + dht_conf_t *conf = NULL; + xlator_t *child = NULL; + int i = 0; + time_t time = 0; - conf = this->private; - if (!conf) - goto out; + conf = this->private; + if (!conf) + goto out; - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvolume_status[i]) { - child = conf->subvolumes[i]; - break; - } + LOCK(&conf->subvolume_lock); 
+ { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvol_up_time[i]) { + if (!time) { + time = conf->subvol_up_time[i]; + child = conf->subvolumes[i]; + } else if (time > conf->subvol_up_time[i]) { + time = conf->subvol_up_time[i]; + child = conf->subvolumes[i]; } + } } - UNLOCK (&conf->subvolume_lock); + } + UNLOCK(&conf->subvolume_lock); out: - return child; + return child; } xlator_t * -dht_last_up_subvol (xlator_t *this) +dht_last_up_subvol(xlator_t *this) { - dht_conf_t *conf = NULL; - xlator_t *child = NULL; - int i = 0; + dht_conf_t *conf = NULL; + xlator_t *child = NULL; + int i = 0; + + conf = this->private; + if (!conf) + goto out; + + LOCK(&conf->subvolume_lock); + { + for (i = conf->subvolume_cnt - 1; i >= 0; i--) { + if (conf->subvolume_status[i]) { + child = conf->subvolumes[i]; + break; + } + } + } + UNLOCK(&conf->subvolume_lock); - conf = this->private; - if (!conf) - goto out; +out: + return child; +} - LOCK (&conf->subvolume_lock); - { - for (i = conf->subvolume_cnt-1; i >= 0; i--) { - if (conf->subvolume_status[i]) { - child = conf->subvolumes[i]; - break; - } - } - } - UNLOCK (&conf->subvolume_lock); +xlator_t * +dht_subvol_get_hashed(xlator_t *this, loc_t *loc) +{ + dht_layout_t *layout = NULL; + xlator_t *subvol = NULL; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; + + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); + + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, out); + + methods = &(conf->methods); + + if (__is_root_gfid(loc->gfid)) { + subvol = dht_first_up_subvol(this); + goto out; + } + + GF_VALIDATE_OR_GOTO(this->name, loc->parent, out); + GF_VALIDATE_OR_GOTO(this->name, loc->name, out); + + layout = dht_layout_get(this, loc->parent); + + if (!layout) { + gf_msg_debug(this->name, 0, "Missing layout. 
path=%s, parent gfid =%s", + loc->path, uuid_utoa(loc->parent->gfid)); + goto out; + } + + subvol = methods->layout_search(this, layout, loc->name); + + if (!subvol) { + gf_msg_debug(this->name, 0, "No hashed subvolume for path=%s", + loc->path); + goto out; + } out: - return child; + if (layout) { + dht_layout_unref(this, layout); + } + + return subvol; } xlator_t * -dht_subvol_get_hashed (xlator_t *this, loc_t *loc) +dht_subvol_get_cached(xlator_t *this, inode_t *inode) { - dht_layout_t *layout = NULL; - xlator_t *subvol = NULL; + dht_layout_t *layout = NULL; + xlator_t *subvol = NULL; - if (is_fs_root (loc)) { - subvol = dht_first_up_subvol (this); - goto out; - } + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); - layout = dht_layout_get (this, loc->parent); + layout = dht_layout_get(this, inode); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "layout missing path=%s parent=%"PRId64, - loc->path, loc->parent->ino); - goto out; + if (!layout) { + goto out; + } + + subvol = layout->list[0].xlator; + +out: + if (layout) { + dht_layout_unref(this, layout); + } + + return subvol; +} + +xlator_t * +dht_subvol_next(xlator_t *this, xlator_t *prev) +{ + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; + + conf = this->private; + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + if ((i + 1) < conf->subvolume_cnt) + next = conf->subvolumes[i + 1]; + break; } + } - subvol = dht_layout_search (this, layout, loc->name); +out: + return next; +} - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "could not find subvolume for path=%s", - loc->path); - goto out; +/* This func wraps around, if prev is actually the last subvol. 
+ */ +xlator_t * +dht_subvol_next_available(xlator_t *this, xlator_t *prev) +{ + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; + + conf = this->private; + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + /* if prev is last in conf->subvolumes, then wrap + * around. + */ + if ((i + 1) < conf->subvolume_cnt) { + next = conf->subvolumes[i + 1]; + } else { + next = conf->subvolumes[0]; + } + break; } + } out: - if (layout) { - dht_layout_unref (this, layout); + return next; +} +int +dht_subvol_cnt(xlator_t *this, xlator_t *subvol) +{ + int i = 0; + int ret = -1; + dht_conf_t *conf = NULL; + + conf = this->private; + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + ret = i; + break; } + } - return subvol; +out: + return ret; } +#define set_if_greater(a, b) \ + do { \ + if ((a) < (b)) \ + (a) = (b); \ + } while (0) -xlator_t * -dht_subvol_get_cached (xlator_t *this, inode_t *inode) +#define set_if_greater_time(a, an, b, bn) \ + do { \ + if (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))) { \ + (a) = (b); \ + (an) = (bn); \ + } \ + } while (0) + +int +dht_iatt_merge(xlator_t *this, struct iatt *to, struct iatt *from) { - dht_layout_t *layout = NULL; - xlator_t *subvol = NULL; + if (!from || !to) + return 0; + to->ia_dev = from->ia_dev; + + gf_uuid_copy(to->ia_gfid, from->ia_gfid); + + to->ia_ino = from->ia_ino; + to->ia_prot = from->ia_prot; + to->ia_type = from->ia_type; + to->ia_nlink = from->ia_nlink; + to->ia_rdev = from->ia_rdev; + to->ia_size += from->ia_size; + to->ia_blksize = from->ia_blksize; + to->ia_blocks += from->ia_blocks; + + if (IA_ISDIR(from->ia_type)) { + to->ia_blocks = DHT_DIR_STAT_BLOCKS; + to->ia_size = DHT_DIR_STAT_SIZE; + } + set_if_greater(to->ia_uid, from->ia_uid); + set_if_greater(to->ia_gid, from->ia_gid); + + set_if_greater_time(to->ia_atime, to->ia_atime_nsec, from->ia_atime, + from->ia_atime_nsec); + 
set_if_greater_time(to->ia_mtime, to->ia_mtime_nsec, from->ia_mtime, + from->ia_mtime_nsec); + set_if_greater_time(to->ia_ctime, to->ia_ctime_nsec, from->ia_ctime, + from->ia_ctime_nsec); + + return 0; +} - layout = dht_layout_get (this, inode); +int +dht_build_child_loc(xlator_t *this, loc_t *child, loc_t *parent, char *name) +{ + if (!child) { + goto err; + } - if (!layout) { - goto out; - } + if (strcmp(parent->path, "/") == 0) + gf_asprintf((char **)&child->path, "/%s", name); + else + gf_asprintf((char **)&child->path, "%s/%s", parent->path, name); + + if (!child->path) { + goto err; + } + + child->name = strrchr(child->path, '/'); + if (child->name) + child->name++; + + child->parent = inode_ref(parent->inode); + child->inode = inode_new(parent->inode->table); + + if (!child->inode) { + goto err; + } + + return 0; +err: + if (child) { + loc_wipe(child); + } + return -1; +} + +int +dht_init_local_subvolumes(xlator_t *this, dht_conf_t *conf) +{ + xlator_list_t *subvols = NULL; + int cnt = 0; + + if (!conf) + return -1; - subvol = layout->list[0].xlator; + for (subvols = this->children; subvols; subvols = subvols->next) + cnt++; + + conf->local_subvols = GF_CALLOC(cnt, sizeof(xlator_t *), + gf_dht_mt_xlator_t); + + /* FIX FIX : do this dynamically*/ + conf->local_nodeuuids = GF_CALLOC(cnt, sizeof(subvol_nodeuuids_info_t), + gf_dht_nodeuuids_t); + + if (!conf->local_subvols || !conf->local_nodeuuids) { + return -1; + } + + conf->local_subvols_cnt = 0; + + return 0; +} + +int +dht_init_subvolumes(xlator_t *this, dht_conf_t *conf) +{ + xlator_list_t *subvols = NULL; + int cnt = 0; + + if (!conf) + return -1; + + for (subvols = this->children; subvols; subvols = subvols->next) + cnt++; + + conf->subvolumes = GF_CALLOC(cnt, sizeof(xlator_t *), gf_dht_mt_xlator_t); + if (!conf->subvolumes) { + return -1; + } + conf->subvolume_cnt = cnt; + /* Doesn't make sense to do any dht layer tasks + if the subvol count is 1. 
Set it as pass_through */ + if (cnt == 1) + this->pass_through = _gf_true; + + conf->local_subvols_cnt = 0; + + dht_set_subvol_range(this); + + cnt = 0; + for (subvols = this->children; subvols; subvols = subvols->next) + conf->subvolumes[cnt++] = subvols->xlator; + + conf->subvolume_status = GF_CALLOC(cnt, sizeof(char), gf_dht_mt_char); + if (!conf->subvolume_status) { + return -1; + } + + conf->last_event = GF_CALLOC(cnt, sizeof(int), gf_dht_mt_char); + if (!conf->last_event) { + return -1; + } + + conf->subvol_up_time = GF_CALLOC(cnt, sizeof(time_t), + gf_dht_mt_subvol_time); + if (!conf->subvol_up_time) { + return -1; + } + + conf->du_stats = GF_CALLOC(conf->subvolume_cnt, sizeof(dht_du_t), + gf_dht_mt_dht_du_t); + if (!conf->du_stats) { + return -1; + } + + conf->decommissioned_bricks = GF_CALLOC(cnt, sizeof(xlator_t *), + gf_dht_mt_xlator_t); + if (!conf->decommissioned_bricks) { + return -1; + } + + return 0; +} + +/* + op_ret values : + 0 : Success. + -1 : Failure. + 1 : File is being migrated but not by this DHT layer. 
+*/ + +static int +dht_migration_complete_check_done(int op_ret, call_frame_t *frame, void *data) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (local->cached_subvol == NULL) { + local->op_errno = EINVAL; + goto out; + } + + subvol = local->cached_subvol; out: - if (layout) { - dht_layout_unref (this, layout); + local->rebalance.target_op_fn(THIS, subvol, frame, op_ret); + + return 0; +} + +int +dht_migration_complete_check_task(void *data) +{ + int ret = -1; + xlator_t *src_node = NULL; + xlator_t *dst_node = NULL, *linkto_target = NULL; + dht_local_t *local = NULL; + dict_t *dict = NULL; + struct iatt stbuf = { + 0, + }; + xlator_t *this = NULL; + call_frame_t *frame = NULL; + loc_t tmp_loc = { + 0, + }; + char *path = NULL; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + fd_t *tmp = NULL; + uint64_t tmp_miginfo = 0; + dht_migrate_info_t *miginfo = NULL; + gf_boolean_t skip_open = _gf_false; + int open_failed = 0; + + this = THIS; + frame = data; + local = frame->local; + conf = this->private; + + src_node = local->cached_subvol; + + if (!local->loc.inode && !local->fd) { + local->op_errno = EINVAL; + goto out; + } + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. If a fd is already open, access check won't be done*/ + + if (!local->loc.inode) { + ret = syncop_fgetxattr(src_node, local->fd, &dict, + conf->link_xattr_name, NULL, NULL); + } else { + SYNCTASK_SETID(0, 0); + ret = syncop_getxattr(src_node, &local->loc, &dict, + conf->link_xattr_name, NULL, NULL); + SYNCTASK_SETID(frame->root->uid, frame->root->gid); + } + + /* + * Each DHT xlator layer has its own name for the linkto xattr. 
+ * If the file mode bits indicate the the file is being migrated but + * this layer's linkto xattr is not set, it means that another + * DHT layer is migrating the file. In this case, return 1 so + * the mode bits can be passed on to the higher layer for appropriate + * action. + */ + if (-ret == ENODATA) { + /* This DHT translator is not migrating this file */ + + ret = inode_ctx_reset1(inode, this, &tmp_miginfo); + if (tmp_miginfo) { + /* This can be a problem if the file was + * migrated by two different layers. Raise + * a warning here. + */ + gf_smsg( + this->name, GF_LOG_WARNING, 0, DHT_MSG_HAS_MIGINFO, "tmp=%s", + tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), NULL); + + miginfo = (void *)(uintptr_t)tmp_miginfo; + GF_REF_PUT(miginfo); + } + ret = 1; + goto out; + } + + if (!ret) + linkto_target = dht_linkfile_subvol(this, NULL, NULL, dict); + + if (local->loc.inode) { + loc_copy(&tmp_loc, &local->loc); + } else { + tmp_loc.inode = inode_ref(inode); + gf_uuid_copy(tmp_loc.gfid, inode->gfid); + } + + ret = syncop_lookup(this, &tmp_loc, &stbuf, 0, 0, 0); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_FILE_LOOKUP_FAILED, + "tmp=%s", tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), + "name=%s", this->name, NULL); + local->op_errno = -ret; + ret = -1; + goto out; + } + + dst_node = dht_subvol_get_cached(this, tmp_loc.inode); + if (linkto_target && dst_node != linkto_target) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_LINKFILE, + "linkto_target_name=%s", linkto_target->name, "dst_name=%s", + dst_node->name, NULL); + } + + if (gf_uuid_compare(stbuf.ia_gfid, tmp_loc.inode->gfid)) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, "tmp=%s", + tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), + "dst_name=%s", dst_node->name, NULL); + ret = -1; + local->op_errno = EIO; + goto out; + } + + /* update local. 
A layout is set in inode-ctx in lookup already */ + + dht_layout_unref(this, local->layout); + + local->layout = dht_layout_get(frame->this, inode); + local->cached_subvol = dst_node; + + ret = 0; + + /* once we detect the migration complete, the inode-ctx2 is no more + required.. delete the ctx and also, it means, open() already + done on all the fd of inode */ + ret = inode_ctx_reset1(inode, this, &tmp_miginfo); + if (tmp_miginfo) { + miginfo = (void *)(uintptr_t)tmp_miginfo; + GF_REF_PUT(miginfo); + goto out; + } + + /* perform 'open()' on all the fd's present on the inode */ + if (tmp_loc.path == NULL) { + inode_path(inode, NULL, &path); + if (path) + tmp_loc.path = path; + } + + LOCK(&inode->lock); + + if (list_empty(&inode->fd_list)) + goto unlock; + + /* perform open as root:root. There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID(0, 0); + + /* It's possible that we are the last user of iter_fd after each + * iteration. In this case the fd_unref() of iter_fd at the end of + * the loop will cause the destruction of the fd. So we need to + * iterate the list safely because iter_fd cannot be trusted. + */ + iter_fd = list_entry((&inode->fd_list)->next, typeof(*iter_fd), inode_list); + while (&iter_fd->inode_list != (&inode->fd_list)) { + if (fd_is_anonymous(iter_fd) || + (dht_fd_open_on_dst(this, iter_fd, dst_node))) { + if (!tmp) { + iter_fd = list_entry(iter_fd->inode_list.next, typeof(*iter_fd), + inode_list); + continue; + } + skip_open = _gf_true; } + /* We need to release the inode->lock before calling + * syncop_open() to avoid possible deadlocks. However this + * can cause the iter_fd to be released by other threads. + * To avoid this, we take a reference before releasing the + * lock. 
+ */ + fd_ref(iter_fd); + + UNLOCK(&inode->lock); + + if (tmp) { + fd_unref(tmp); + tmp = NULL; + } + if (skip_open) + goto next; + + /* flags for open are stripped down to allow following the + * new location of the file, otherwise we can get EEXIST or + * truncate the file again as rebalance is moving the data */ + ret = syncop_open(dst_node, &tmp_loc, + (iter_fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)), + iter_fd, NULL, NULL); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_OPEN_FD_ON_DST_FAILED, "id=%p", iter_fd, + "flags=0%o", iter_fd->flags, "path=%s", path, "name=%s", + dst_node->name, NULL); + + open_failed = 1; + local->op_errno = -ret; + ret = -1; + } else { + dht_fd_ctx_set(this, iter_fd, dst_node); + } + + next: + LOCK(&inode->lock); + skip_open = _gf_false; + tmp = iter_fd; + iter_fd = list_entry(tmp->inode_list.next, typeof(*tmp), inode_list); + } + + SYNCTASK_SETID(frame->root->uid, frame->root->gid); + + if (open_failed) { + ret = -1; + goto unlock; + } + ret = 0; - return subvol; +unlock: + UNLOCK(&inode->lock); + if (tmp) { + fd_unref(tmp); + tmp = NULL; + } + +out: + if (dict) { + dict_unref(dict); + } + + loc_wipe(&tmp_loc); + + return ret; } +int +dht_rebalance_complete_check(xlator_t *this, call_frame_t *frame) +{ + int ret = -1; -xlator_t * -dht_subvol_next (xlator_t *this, xlator_t *prev) + ret = synctask_new(this->ctx->env, dht_migration_complete_check_task, + dht_migration_complete_check_done, frame, frame); + return ret; +} + +/* During 'in-progress' state, both nodes should have the file */ +/* + op_ret values : + 0 : Success + -1 : Failure. + 1 : File is being migrated but not by this DHT layer. 
+*/ +static int +dht_inprogress_check_done(int op_ret, call_frame_t *frame, void *data) { - dht_conf_t *conf = NULL; - int i = 0; - xlator_t *next = NULL; + dht_local_t *local = NULL; + xlator_t *dst_subvol = NULL, *src_subvol = NULL; + inode_t *inode = NULL; - conf = this->private; - if (!conf) - goto out; + local = frame->local; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvolumes[i] == prev) { - if ((i + 1) < conf->subvolume_cnt) - next = conf->subvolumes[i + 1]; - break; - } + if (op_ret != 0) + goto out; + + inode = local->loc.inode ? local->loc.inode : local->fd->inode; + + dht_inode_ctx_get_mig_info(THIS, inode, &src_subvol, &dst_subvol); + if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol, dst_subvol)) { + dst_subvol = dht_subvol_get_cached(THIS, inode); + if (!dst_subvol) { + local->op_errno = EINVAL; + goto out; } + } out: - return next; + local->rebalance.target_op_fn(THIS, dst_subvol, frame, op_ret); + + return 0; } +static int +dht_rebalance_inprogress_task(void *data) +{ + int ret = -1; + xlator_t *src_node = NULL; + xlator_t *dst_node = NULL; + dht_local_t *local = NULL; + dict_t *dict = NULL; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + char *path = NULL; + struct iatt stbuf = { + 0, + }; + loc_t tmp_loc = { + 0, + }; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + fd_t *tmp = NULL; + int open_failed = 0; + uint64_t tmp_miginfo = 0; + dht_migrate_info_t *miginfo = NULL; + gf_boolean_t skip_open = _gf_false; + + this = THIS; + frame = data; + local = frame->local; + conf = this->private; + + src_node = local->cached_subvol; + + if (!local->loc.inode && !local->fd) + goto out; + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. 
If a fd is already open, access check won't be done*/ + if (local->loc.inode) { + SYNCTASK_SETID(0, 0); + ret = syncop_getxattr(src_node, &local->loc, &dict, + conf->link_xattr_name, NULL, NULL); + SYNCTASK_SETID(frame->root->uid, frame->root->gid); + } else { + ret = syncop_fgetxattr(src_node, local->fd, &dict, + conf->link_xattr_name, NULL, NULL); + } + + /* + * Each DHT xlator layer has its own name for the linkto xattr. + * If the file mode bits indicate the the file is being migrated but + * this layer's linkto xattr is not present, it means that another + * DHT layer is migrating the file. In this case, return 1 so + * the mode bits can be passed on to the higher layer for appropriate + * action. + */ + + if (-ret == ENODATA) { + /* This DHT layer is not migrating this file */ + ret = inode_ctx_reset1(inode, this, &tmp_miginfo); + if (tmp_miginfo) { + /* This can be a problem if the file was + * migrated by two different layers. Raise + * a warning here. + */ + gf_smsg( + this->name, GF_LOG_WARNING, 0, DHT_MSG_HAS_MIGINFO, "tmp=%s", + tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), NULL); + miginfo = (void *)(uintptr_t)tmp_miginfo; + GF_REF_PUT(miginfo); + } + ret = 1; + goto out; + } + + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_GET_XATTR_FAILED, + "path=%s", local->loc.path, NULL); + ret = -1; + goto out; + } + + dst_node = dht_linkfile_subvol(this, NULL, NULL, dict); + if (!dst_node) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GET_XATTR_FAILED, + "path=%s", local->loc.path, NULL); + ret = -1; + goto out; + } + + local->rebalance.target_node = dst_node; + + if (local->loc.inode) { + loc_copy(&tmp_loc, &local->loc); + } else { + tmp_loc.inode = inode_ref(inode); + gf_uuid_copy(tmp_loc.gfid, inode->gfid); + } + + /* lookup on dst */ + ret = syncop_lookup(dst_node, &tmp_loc, &stbuf, NULL, NULL, NULL); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_FILE_LOOKUP_FAILED, + "tmp=%s", tmp_loc.path ? 
tmp_loc.path : uuid_utoa(tmp_loc.gfid), + "name=%s", dst_node->name, NULL); + ret = -1; + goto out; + } + + if (gf_uuid_compare(stbuf.ia_gfid, tmp_loc.inode->gfid)) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, "tmp=%s", + tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), + "name=%s", dst_node->name, NULL); + ret = -1; + goto out; + } + ret = 0; + + if (tmp_loc.path == NULL) { + inode_path(inode, NULL, &path); + if (path) + tmp_loc.path = path; + } + + LOCK(&inode->lock); + + if (list_empty(&inode->fd_list)) + goto unlock; + + /* perform open as root:root. There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID(0, 0); + + /* It's possible that we are the last user of iter_fd after each + * iteration. In this case the fd_unref() of iter_fd at the end of + * the loop will cause the destruction of the fd. So we need to + * iterate the list safely because iter_fd cannot be trusted. + */ + iter_fd = list_entry((&inode->fd_list)->next, typeof(*iter_fd), inode_list); + while (&iter_fd->inode_list != (&inode->fd_list)) { + /* We need to release the inode->lock before calling + * syncop_open() to avoid possible deadlocks. However this + * can cause the iter_fd to be released by other threads. + * To avoid this, we take a reference before releasing the + * lock. + */ + + if (fd_is_anonymous(iter_fd) || + (dht_fd_open_on_dst(this, iter_fd, dst_node))) { + if (!tmp) { + iter_fd = list_entry(iter_fd->inode_list.next, typeof(*iter_fd), + inode_list); + continue; + } + skip_open = _gf_true; + } + + /* Yes, this is ugly but there isn't a cleaner way to do this + * the fd_ref is an atomic increment so not too bad. We want to + * reduce the number of inode locks and unlocks. 
+ */ + + fd_ref(iter_fd); + UNLOCK(&inode->lock); + + if (tmp) { + fd_unref(tmp); + tmp = NULL; + } + if (skip_open) + goto next; + + /* flags for open are stripped down to allow following the + * new location of the file, otherwise we can get EEXIST or + * truncate the file again as rebalance is moving the data */ + ret = syncop_open(dst_node, &tmp_loc, + (iter_fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)), + iter_fd, NULL, NULL); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_OPEN_FD_ON_DST_FAILED, "fd=%p", iter_fd, + "flags=0%o", iter_fd->flags, "path=%s", path, "name=%s", + dst_node->name, NULL); + ret = -1; + open_failed = 1; + } else { + /* Potential fd leak if this fails here as it will be + reopened at the next Phase1/2 check */ + dht_fd_ctx_set(this, iter_fd, dst_node); + } + + next: + LOCK(&inode->lock); + skip_open = _gf_false; + tmp = iter_fd; + iter_fd = list_entry(tmp->inode_list.next, typeof(*tmp), inode_list); + } + + SYNCTASK_SETID(frame->root->uid, frame->root->gid); + +unlock: + UNLOCK(&inode->lock); + + if (tmp) { + fd_unref(tmp); + tmp = NULL; + } + if (open_failed) { + ret = -1; + goto out; + } + + ret = dht_inode_ctx_set_mig_info(this, inode, src_node, dst_node); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "path=%s", local->loc.path, "name=%s", dst_node->name, NULL); + goto out; + } + + ret = 0; +out: + if (dict) { + dict_unref(dict); + } + + loc_wipe(&tmp_loc); + return ret; +} int -dht_subvol_cnt (xlator_t *this, xlator_t *subvol) +dht_rebalance_in_progress_check(xlator_t *this, call_frame_t *frame) { - int i = 0; - int ret = -1; - dht_conf_t *conf = NULL; + int ret = -1; - conf = this->private; - if (!conf) - goto out; + ret = synctask_new(this->ctx->env, dht_rebalance_inprogress_task, + dht_inprogress_check_done, frame, frame); + return ret; +} - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - ret = i; - break; - } - } +int 
+dht_inode_ctx_layout_set(inode_t *inode, xlator_t *this, + dht_layout_t *layout_int) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + + ret = dht_inode_ctx_get(inode, this, &ctx); + if (!ret && ctx) { + ctx->layout = layout_int; + } else { + ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + return ret; + ctx->layout = layout_int; + } + + ret = dht_inode_ctx_set(inode, this, ctx); + + return ret; +} + +void +dht_inode_ctx_time_set(inode_t *inode, xlator_t *this, struct iatt *stat) +{ + dht_inode_ctx_t *ctx = NULL; + dht_stat_time_t *time = 0; + int ret = -1; + + ret = dht_inode_ctx_get(inode, this, &ctx); + + if (ret) + return; + time = &ctx->time; + + time->mtime = stat->ia_mtime; + time->mtime_nsec = stat->ia_mtime_nsec; + + time->ctime = stat->ia_ctime; + time->ctime_nsec = stat->ia_ctime_nsec; + + time->atime = stat->ia_atime; + time->atime_nsec = stat->ia_atime_nsec; + + return; +} + +int +dht_inode_ctx_time_update(inode_t *inode, xlator_t *this, struct iatt *stat, + int32_t post) +{ + dht_inode_ctx_t *ctx = NULL; + dht_stat_time_t *time = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO(this->name, stat, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + ret = dht_inode_ctx_get(inode, this, &ctx); + + if (ret) { + ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + return -1; + } + + time = &ctx->time; + + LOCK(&inode->lock); + { + DHT_UPDATE_TIME(time->mtime, time->mtime_nsec, stat->ia_mtime, + stat->ia_mtime_nsec, post); + DHT_UPDATE_TIME(time->ctime, time->ctime_nsec, stat->ia_ctime, + stat->ia_ctime_nsec, post); + DHT_UPDATE_TIME(time->atime, time->atime_nsec, stat->ia_atime, + stat->ia_atime_nsec, post); + } + UNLOCK(&inode->lock); + + ret = dht_inode_ctx_set(inode, this, ctx); out: + return 0; +} + +int +dht_inode_ctx_get(inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + 
ret = inode_ctx_get(inode, this, &ctx_int); + + if (ret) return ret; + + if (ctx) + *ctx = (dht_inode_ctx_t *)(uintptr_t)ctx_int; +out: + return ret; } +int +dht_inode_ctx_set(inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; -#define set_if_greater(a, b) do { \ - if ((a) < (b)) \ - (a) = (b); \ - } while (0) + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, ctx, out); + + ctx_int = (long)ctx; + ret = inode_ctx_set(inode, this, &ctx_int); +out: + return ret; +} int -dht_iatt_merge (xlator_t *this, struct iatt *to, - struct iatt *from, xlator_t *subvol) +dht_subvol_status(dht_conf_t *conf, xlator_t *subvol) { - if (!from || !to) - return 0; + int i; - to->ia_dev = from->ia_dev; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == subvol) { + return conf->subvolume_status[i]; + } + } + return 0; +} - uuid_copy (to->ia_gfid, from->ia_gfid); +inode_t * +dht_heal_path(xlator_t *this, char *path, inode_table_t *itable) +{ + int ret = -1; + struct iatt iatt = { + 0, + }; + inode_t *linked_inode = NULL; + loc_t loc = { + 0, + }; + char *bname = NULL; + char *save_ptr = NULL; + static uuid_t gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + char *tmp_path = NULL; + + tmp_path = gf_strdup(path); + if (!tmp_path) { + goto out; + } + + gf_uuid_copy(loc.pargfid, gfid); + loc.parent = inode_ref(itable->root); + + bname = strtok_r(tmp_path, "/", &save_ptr); + + /* sending a lookup on parent directory, + * Eg: if path is like /a/b/c/d/e/f/g/ + * then we will send a lookup on a first and then b,c,d,etc + */ + + while (bname) { + linked_inode = NULL; + loc.inode = inode_grep(itable, loc.parent, bname); + if (loc.inode == NULL) { + loc.inode = inode_new(itable); + if (loc.inode == NULL) { + ret = -ENOMEM; + goto out; + } + } else { + /* + * Inode is already populated in the inode table. 
+ * Which means we already looked up the inode and + * linked with a dentry. So that we will skip + * lookup on this entry, and proceed to next. + */ + linked_inode = loc.inode; + bname = strtok_r(NULL, "/", &save_ptr); + if (!bname) { + goto out; + } + inode_unref(loc.parent); + loc.parent = loc.inode; + gf_uuid_copy(loc.pargfid, loc.inode->gfid); + loc.inode = NULL; + continue; + } - dht_itransform (this, subvol, from->ia_ino, &to->ia_ino); + loc.name = bname; + ret = loc_path(&loc, bname); - to->ia_prot = from->ia_prot; - to->ia_type = from->ia_type; - to->ia_nlink = from->ia_nlink; - to->ia_rdev = from->ia_rdev; - to->ia_size += from->ia_size; - to->ia_blksize = from->ia_blksize; - to->ia_blocks += from->ia_blocks; + ret = syncop_lookup(this, &loc, &iatt, NULL, NULL, NULL); + if (ret) { + gf_smsg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_SELFHEAL_FAILED, + "path=%s", path, "subvolume=%s", this->name, "bname=%s", + bname, NULL); + goto out; + } - set_if_greater (to->ia_uid, from->ia_uid); - set_if_greater (to->ia_gid, from->ia_gid); + linked_inode = inode_link(loc.inode, loc.parent, bname, &iatt); + if (!linked_inode) + goto out; - set_if_greater (to->ia_atime, from->ia_atime); - set_if_greater (to->ia_mtime, from->ia_mtime); - set_if_greater (to->ia_ctime, from->ia_ctime); + loc_wipe(&loc); + gf_uuid_copy(loc.pargfid, linked_inode->gfid); + loc.inode = NULL; - return 0; + bname = strtok_r(NULL, "/", &save_ptr); + if (bname) + loc.parent = linked_inode; + } +out: + inode_ref(linked_inode); + loc_wipe(&loc); + GF_FREE(tmp_path); + + return linked_inode; } int -dht_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +dht_heal_full_path(void *data) { - if (!child) { - goto err; + call_frame_t *heal_frame = data; + dht_local_t *local = NULL; + loc_t loc = { + 0, + }; + dict_t *dict = NULL; + char *path = NULL; + int ret = -1; + xlator_t *source = NULL; + xlator_t *this = NULL; + inode_table_t *itable = NULL; + inode_t *inode = NULL; + inode_t 
*tmp_inode = NULL;
+
+    GF_VALIDATE_OR_GOTO("DHT", heal_frame, out);
+
+    local = heal_frame->local;
+    this = heal_frame->this;
+    source = heal_frame->cookie; /* subvolume to query is handed over in the cookie */
+    heal_frame->cookie = NULL;
+    gf_uuid_copy(loc.gfid, local->gfid);
+
+    if (local->loc.inode)
+        loc.inode = inode_ref(local->loc.inode);
+    else
+        goto out;
+
+    itable = loc.inode->table;
+    ret = syncop_getxattr(source, &loc, &dict, GET_ANCESTRY_PATH_KEY, NULL,
+                          NULL);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_HEAL_ABORT,
+                "subvol=%s", source->name, NULL);
+        goto out;
+    }
+
+    ret = dict_get_str(dict, GET_ANCESTRY_PATH_KEY, &path); /* success is judged by path below, ret is not consulted */
+    if (path) {
+        inode = dht_heal_path(this, path, itable);
+        if (inode && inode != local->inode) {
+            /*
+             * If the inode returned by the heal function differs
+             * from the one we passed in, a racing thread has
+             * already linked a different inode for the dentry.
+             * Update local->inode so that the proper inode is
+             * returned.
+             */
+            tmp_inode = local->inode;
+            local->inode = inode;
+            inode_unref(tmp_inode);
+            tmp_inode = NULL;
+        } else {
+            inode_unref(inode); /* same inode (or none): drop the ref dht_heal_path() took before returning */
         }
+    }
 
-        if (strcmp (parent->path, "/") == 0)
-                gf_asprintf ((char **)&child->path, "/%s", name);
-        else
-                gf_asprintf ((char **)&child->path, "%s/%s", parent->path, name);
+out:
+    loc_wipe(&loc);
+    if (dict)
+        dict_unref(dict);
+    return 0; /* always 0; failures are only logged */
+}
 
-        if (!child->path) {
-                goto err;
+int
+dht_heal_full_path_done(int op_ret, call_frame_t *heal_frame, void *data)
+{
+    call_frame_t *main_frame = NULL;
+    dht_local_t *local = NULL;
+    xlator_t *this = NULL;
+    int ret = -1;
+    int op_errno = 0;
+
+    local = heal_frame->local;
+    main_frame = local->main_frame; /* the lookup is unwound on this frame further down */
+    local->main_frame = NULL;
+    this = heal_frame->this;
+
+    dht_set_fixed_dir_stat(&local->postparent);
+    if (local->need_xattr_heal) {
+        local->need_xattr_heal = 0;
+        ret = dht_dir_xattr_heal(this, local, &op_errno);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                    DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s", local->loc.path,
+                    NULL);
         }
+    }
 
-
child->name = strrchr (child->path, '/'); - if (child->name) - child->name++; + DHT_STACK_UNWIND(lookup, main_frame, 0, 0, local->inode, &local->stbuf, + local->xattr, &local->postparent); - child->parent = inode_ref (parent->inode); - child->inode = inode_new (parent->inode->table); + DHT_STACK_DESTROY(heal_frame); + return 0; +} - if (!child->inode) { - goto err; - } +/* This function must be called inside an inode lock */ +int +__dht_lock_subvol_set(inode_t *inode, xlator_t *this, xlator_t *lock_subvol) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + uint64_t value = 0; - return 0; -err: - loc_wipe (child); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + ret = __inode_ctx_get0(inode, this, &value); + if (ret || !value) { return -1; + } + + ctx = (dht_inode_ctx_t *)(uintptr_t)value; + ctx->lock_subvol = lock_subvol; +out: + return ret; +} + +xlator_t * +dht_get_lock_subvolume(xlator_t *this, struct gf_flock *lock, + dht_local_t *local) +{ + xlator_t *subvol = NULL; + inode_t *inode = NULL; + int32_t ret = -1; + uint64_t value = 0; + xlator_t *cached_subvol = NULL; + dht_inode_ctx_t *ctx = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + GF_VALIDATE_OR_GOTO(this->name, lock, out); + GF_VALIDATE_OR_GOTO(this->name, local, out); + + cached_subvol = local->cached_subvol; + + if (local->loc.inode || local->fd) { + inode = local->loc.inode ? local->loc.inode : local->fd->inode; + } + + if (!inode) + goto out; + + if (!(IA_ISDIR(inode->ia_type) || IA_ISINVAL(inode->ia_type))) { + /* + * We may get non-linked inode for directories as part + * of the selfheal code path. So checking for IA_INVAL + * type also. This will only happen for directory. + */ + subvol = local->cached_subvol; + goto out; + } + + if (lock->l_type != F_UNLCK) { + /* + * inode purging might happen on NFS between a lk + * and unlk. Due to this lk and unlk might be sent + * to different subvols. + * So during a lock request, taking a ref on inode + * to prevent inode purging. 
inode unref will happen + * in unlock cbk code path. + */ + inode_ref(inode); + } + + LOCK(&inode->lock); + ret = __inode_ctx_get0(inode, this, &value); + if (!ret && value) { + ctx = (dht_inode_ctx_t *)(uintptr_t)value; + subvol = ctx->lock_subvol; + } + if (!subvol && lock->l_type != F_UNLCK && cached_subvol) { + ret = __dht_lock_subvol_set(inode, this, cached_subvol); + if (ret) { + gf_uuid_unparse(inode->gfid, gfid); + UNLOCK(&inode->lock); + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "lock_subvol gfid=%s", gfid, NULL); + goto post_unlock; + } + subvol = cached_subvol; + } + UNLOCK(&inode->lock); +post_unlock: + if (!subvol && inode && lock->l_type != F_UNLCK) { + inode_unref(inode); + } +out: + return subvol; +} + +int +dht_lk_inode_unref(call_frame_t *frame, int32_t op_ret) +{ + int ret = -1; + dht_local_t *local = NULL; + inode_t *inode = NULL; + xlator_t *this = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + this = frame->this; + + if (local->loc.inode || local->fd) { + inode = local->loc.inode ? 
local->loc.inode : local->fd->inode; + } + if (!inode) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LOCK_INODE_UNREF_FAILED, + NULL); + goto out; + } + + if (!(IA_ISDIR(inode->ia_type) || IA_ISINVAL(inode->ia_type))) { + ret = 0; + goto out; + } + + switch (local->lock_type) { + case F_RDLCK: + case F_WRLCK: + if (op_ret) { + gf_uuid_unparse(inode->gfid, gfid); + gf_msg_debug(this->name, 0, "lock request failed for gfid %s", + gfid); + inode_unref(inode); + goto out; + } + break; + + case F_UNLCK: + if (!op_ret) { + inode_unref(inode); + } else { + gf_uuid_unparse(inode->gfid, gfid); + gf_smsg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_LOCK_INODE_UNREF_FAILED, "gfid=%s", gfid, NULL); + goto out; + } + default: + break; + } + ret = 0; +out: + return ret; +} + +/* Code to update custom extended attributes from src dict to dst dict + */ +void +dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst, + dict_t *src, int *uret, int *uflag) +{ + int ret = -1; + data_t *keyval = NULL; + int luret = -1; + int luflag = -1; + int i = 0; + char **xattrs_to_heal; + + if (!src || !dst) { + gf_smsg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_DST_NULL_SET_FAILED, + "path=%s", local->loc.path, NULL); + return; + } + /* Check if any user xattr present in src dict and set + it to dst dict + */ + luret = dict_foreach_fnmatch(src, "user.*", dht_set_user_xattr, dst); + /* Check if any other custom xattr present in src dict + and set it to dst dict, here index start from 1 because + user xattr already checked in previous statement + */ + + xattrs_to_heal = get_xattrs_to_heal(); + + for (i = 1; xattrs_to_heal[i]; i++) { + keyval = dict_get(src, xattrs_to_heal[i]); + if (keyval) { + luflag = 1; + ret = dict_set(dst, xattrs_to_heal[i], keyval); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, + DHT_MSG_DICT_SET_FAILED, "key=%s", xattrs_to_heal[i], + "path=%s", local->loc.path, NULL); + keyval = NULL; + } + } + if (uret) + (*uret) = luret; + if (uflag) + (*uflag) = 
luflag; } diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c new file mode 100644 index 00000000000..dbb8070b0da --- /dev/null +++ b/xlators/cluster/dht/src/dht-inode-read.c @@ -0,0 +1,1658 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "dht-common.h" + +static int +dht_access2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret); +static int +dht_readv2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret); +static int +dht_attr2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret); +static int +dht_open2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret); +static int +dht_flush2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret); +static int +dht_lk2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret); +static int +dht_fsync2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret); +static int +dht_common_xattrop2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, + int ret); + +static int +dht_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, fd_t *fd, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = 0; + + local = frame->local; + prev = cookie; + + local->op_errno = op_errno; + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto out; + } + + /* Update ctx if the fd has been opened on the target*/ + if (!op_ret && (local->call_cnt == 1)) { + dht_fd_ctx_set(this, fd, prev); + goto out; + } + + if (!op_ret || (local->call_cnt != 1)) + 
goto out; + + /* rebalance would have happened */ + local->rebalance.target_op_fn = dht_open2; + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + +out: + DHT_STACK_UNWIND(open, frame, op_ret, op_errno, local->fd, xdata); + + return 0; +} + +static int +dht_open2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) +{ + dht_local_t *local = NULL; + int op_errno = EINVAL; + + if (!frame || !frame->local) + goto out; + + local = frame->local; + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This DHT layer is not migrating the file */ + DHT_STACK_UNWIND(open, frame, -1, local->op_errno, NULL, + local->rebalance.xdata); + return 0; + } + + if (subvol == NULL) + goto out; + + local->call_cnt = 2; + + STACK_WIND_COOKIE(frame, dht_open_cbk, subvol, subvol, subvol->fops->open, + &local->loc, local->rebalance.flags, local->fd, + local->xattr_req); + return 0; + +out: + DHT_STACK_UNWIND(open, frame, -1, op_errno, NULL, NULL); + return 0; +} + +int +dht_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, loc, fd, GF_FOP_OPEN); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + if (xdata) + local->xattr_req = dict_ref(xdata); + + local->rebalance.flags = flags; + local->call_cnt = 1; + + STACK_WIND_COOKIE(frame, dht_open_cbk, subvol, subvol, subvol->fops->open, + loc, flags, fd, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(open, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int +dht_file_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *stbuf, dict_t *xdata) +{ + xlator_t *subvol1 = 0; + xlator_t *subvol2 = 0; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((local->fop == GF_FOP_FSTAT) && + dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } + + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + local->op_errno = op_errno; + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto out; + } + + if (local->call_cnt != 1) + goto out; + + local->op_errno = op_errno; + local->op_ret = op_ret; + + /* Check if the rebalance phase2 is true */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) { + local->rebalance.target_op_fn = dht_attr2; + dht_set_local_rebalance(this, local, NULL, NULL, stbuf, xdata); + inode = (local->fd) ? 
local->fd->inode : local->loc.inode; + + dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2); + if (dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) { + /* Phase 2 of migration */ + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } else { + /* it is a non-fd op or it is an fd based Fop and + opened on the dst.*/ + if (local->fd && !dht_fd_open_on_dst(this, local->fd, subvol2)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } else { + dht_attr2(this, subvol2, frame, 0); + return 0; + } + } + } + +out: + DHT_STRIP_PHASE1_FLAGS(stbuf); + DHT_STACK_UNWIND(stat, frame, op_ret, op_errno, stbuf, xdata); +err: + return 0; +} + +static int +dht_attr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) +{ + dht_local_t *local = NULL; + int op_errno = EINVAL; + + local = frame->local; + if (!local) + goto out; + + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. 
+ */ + DHT_STACK_UNWIND(stat, frame, local->op_ret, op_errno, + &local->rebalance.postbuf, local->rebalance.xdata); + return 0; + } + + if (subvol == NULL) + goto out; + + local->call_cnt = 2; + + if (local->fop == GF_FOP_FSTAT) { + STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol, + subvol->fops->fstat, local->fd, local->xattr_req); + } else { + STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol, + subvol->fops->stat, &local->loc, local->xattr_req); + } + + return 0; + +out: + DHT_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL); + return 0; +} + +static int +dht_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *stbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + local = frame->local; + prev = cookie; + + LOCK(&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + UNLOCK(&frame->lock); + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + + goto post_unlock; + } + + dht_iatt_merge(this, &local->stbuf, stbuf); + + local->op_ret = 0; + } + UNLOCK(&frame->lock); +post_unlock: + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + DHT_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno, + &local->stbuf, xdata); + } + + return 0; +} + +int +dht_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_STAT); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto 
err;
+    }
+    if (xdata)
+        local->xattr_req = dict_ref(xdata);
+
+    if (IA_ISREG(loc->inode->ia_type)) {
+        /* regular file: a single stat is wound to the cached subvolume */
+        local->call_cnt = 1;
+
+        subvol = local->cached_subvol;
+        /* Same guard the sibling fops (open, readv, flush, access)
+         * apply: without a cached subvolume the wind below would
+         * dereference NULL.
+         */
+        if (!subvol) {
+            gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+                         loc->path);
+            op_errno = EINVAL;
+            goto err;
+        }
+
+        STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol,
+                          subvol->fops->stat, loc, xdata);
+
+        return 0;
+    }
+
+    /* anything else: ask every subvolume in the layout */
+    local->call_cnt = call_cnt = layout->cnt;
+
+    for (i = 0; i < call_cnt; i++) {
+        subvol = layout->list[i].xlator;
+
+        STACK_WIND_COOKIE(frame, dht_attr_cbk, subvol, subvol,
+                          subvol->fops->stat, loc, xdata);
+    }
+
+    return 0;
+
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL);
+
+    return 0;
+}
+
+/* fstat fop entry point.
+ *
+ * Regular files: one fstat is wound to the cached subvolume and answered
+ * through dht_file_attr_cbk. Everything else: fstat is wound to every
+ * subvolume in the layout and the replies are gathered by dht_attr_cbk.
+ * If validation or setup fails (no local, no layout, no cached subvolume)
+ * the fop is unwound with op_ret -1 and the matching errno.
+ * Always returns 0.
+ */
+int
+dht_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    xlator_t *subvol = NULL;
+    int op_errno = -1;
+    dht_local_t *local = NULL;
+    dht_layout_t *layout = NULL;
+    int i = 0;
+    int call_cnt = 0;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+
+    local = dht_local_init(frame, NULL, fd, GF_FOP_FSTAT);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    layout = local->layout;
+    if (!layout) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "no layout for fd=%p", fd);
+        op_errno = EINVAL;
+        goto err;
+    }
+    if (xdata)
+        local->xattr_req = dict_ref(xdata);
+
+    if (IA_ISREG(fd->inode->ia_type)) {
+        local->call_cnt = 1;
+
+        subvol = local->cached_subvol;
+        /* guard against a missing cached subvolume before winding
+         * through subvol->fops (mirrors dht_flush / dht_readv)
+         */
+        if (!subvol) {
+            gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+            op_errno = EINVAL;
+            goto err;
+        }
+
+        STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol,
+                          subvol->fops->fstat, fd, xdata);
+        return 0;
+    }
+
+    local->call_cnt = call_cnt = layout->cnt;
+
+    for (i = 0; i < call_cnt; i++) {
+        subvol = layout->list[i].xlator;
+        STACK_WIND_COOKIE(frame, dht_attr_cbk, subvol, subvol,
+                          subvol->fops->fstat, fd, xdata);
+    }
+
+    return 0;
+
+err:
+    op_errno = (op_errno == -1) ?
errno : op_errno; + DHT_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int +dht_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iovec *vector, int count, struct iatt *stbuf, + struct iobref *iobref, dict_t *xdata) +{ + dht_local_t *local = NULL; + int ret = 0; + xlator_t *src_subvol = 0; + xlator_t *dst_subvol = 0; + + local = frame->local; + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + /* This is already second try, no need for re-check */ + if (local->call_cnt != 1) + goto out; + + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } + + if ((op_ret == -1) && !dht_inode_missing(op_errno)) + goto out; + + local->op_errno = op_errno; + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) { + local->op_ret = op_ret; + local->rebalance.target_op_fn = dht_readv2; + dht_set_local_rebalance(this, local, NULL, NULL, stbuf, xdata); + /* File would be migrated to other node */ + ret = dht_inode_ctx_get_mig_info(this, local->fd->inode, &src_subvol, + &dst_subvol); + + if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol, + dst_subvol) || + !dht_fd_open_on_dst(this, local->fd, dst_subvol)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } else { + /* value is already set in fd_ctx, that means no need + to check for whether its complete or not. 
*/ + dht_readv2(this, dst_subvol, frame, 0); + return 0; + } + } + +out: + DHT_STRIP_PHASE1_FLAGS(stbuf); + + DHT_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, stbuf, + iobref, xdata); + + return 0; +} + +static int +dht_readv2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) +{ + dht_local_t *local = NULL; + int op_errno = EINVAL; + + local = frame->local; + if (!local) + goto out; + + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(readv, frame, local->op_ret, op_errno, NULL, 0, + &local->rebalance.postbuf, NULL, + local->rebalance.xdata); + return 0; + } + + if (subvol == NULL) + goto out; + + local->call_cnt = 2; + + STACK_WIND(frame, dht_readv_cbk, subvol, subvol->fops->readv, local->fd, + local->rebalance.size, local->rebalance.offset, + local->rebalance.flags, local->xattr_req); + + return 0; + +out: + DHT_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); + return 0; +} + +int +dht_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off, + uint32_t flags, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_READ); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (xdata) + local->xattr_req = dict_ref(xdata); + + local->rebalance.offset = off; + local->rebalance.size = size; + local->rebalance.flags = flags; + local->call_cnt = 1; + + STACK_WIND(frame, dht_readv_cbk, subvol, subvol->fops->readv, local->fd, + local->rebalance.size, local->rebalance.offset, + 
local->rebalance.flags, local->xattr_req); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); + + return 0; +} + +static int +dht_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xdata) +{ + int ret = -1; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + xlator_t *prev = NULL; + + local = frame->local; + prev = cookie; + + if (!prev) + goto out; + if (local->call_cnt != 1) + goto out; + if ((op_ret == -1) && + ((op_errno == ENOTCONN) || dht_inode_missing(op_errno)) && + IA_ISDIR(local->loc.inode->ia_type)) { + subvol = dht_subvol_next_available(this, prev); + if (!subvol) + goto out; + + /* check if we are done with visiting every node */ + if (subvol == local->cached_subvol) { + goto out; + } + + STACK_WIND_COOKIE(frame, dht_access_cbk, subvol, subvol, + subvol->fops->access, &local->loc, + local->rebalance.flags, NULL); + return 0; + } + if ((op_ret == -1) && dht_inode_missing(op_errno) && + !(IA_ISDIR(local->loc.inode->ia_type))) { + /* File would be migrated to other node */ + local->op_errno = op_errno; + local->rebalance.target_op_fn = dht_access2; + ret = dht_rebalance_complete_check(frame->this, frame); + if (!ret) + return 0; + } + +out: + DHT_STACK_UNWIND(access, frame, op_ret, op_errno, xdata); + return 0; +} + +static int +dht_access2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) +{ + dht_local_t *local = NULL; + int op_errno = EINVAL; + + local = frame->local; + if (!local) + goto out; + + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. 
+ */ + + DHT_STACK_UNWIND(access, frame, -1, op_errno, NULL); + return 0; + } + + if (subvol == NULL) + goto out; + + local->call_cnt = 2; + + STACK_WIND_COOKIE(frame, dht_access_cbk, subvol, subvol, + subvol->fops->access, &local->loc, local->rebalance.flags, + local->xattr_req); + + return 0; + +out: + DHT_STACK_UNWIND(access, frame, -1, op_errno, NULL); + return 0; +} + +int +dht_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_ACCESS); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.flags = mask; + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + if (xdata) + local->xattr_req = dict_ref(xdata); + + STACK_WIND_COOKIE(frame, dht_access_cbk, subvol, subvol, + subvol->fops->access, loc, mask, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(access, frame, -1, op_errno, NULL); + + return 0; +} + +int +dht_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *subvol = 0; + int ret = 0; + + local = frame->local; + + local->op_errno = op_errno; + + if (local->call_cnt != 1) + goto out; + + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } + + local->rebalance.target_op_fn = dht_flush2; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + /* If context is set, then send flush() it to the destination */ + dht_inode_ctx_get_mig_info(this, local->fd->inode, NULL, &subvol); + if (subvol && dht_fd_open_on_dst(this, local->fd, subvol)) { + dht_flush2(this, subvol, frame, 0); + return 0; + } + + if (op_errno == EREMOTE) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) { + return 0; + } + } + +out: + DHT_STACK_UNWIND(flush, frame, op_ret, op_errno, xdata); + + return 0; +} + +static int +dht_flush2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) +{ + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + if ((frame == NULL) || (frame->local == NULL)) + goto out; + + local = frame->local; + + op_errno = local->op_errno; + + if (subvol == NULL) + goto out; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_flush_cbk, subvol, subvol->fops->flush, local->fd, + local->xattr_req); + + return 0; + +out: + DHT_STACK_UNWIND(flush, frame, -1, op_errno, NULL); + return 0; +} + +int +dht_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_FLUSH); + if (!local) { + op_errno = 
ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (xdata) + local->xattr_req = dict_ref(xdata); + + local->call_cnt = 1; + + STACK_WIND(frame, dht_flush_cbk, subvol, subvol->fops->flush, fd, + local->xattr_req); + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(flush, frame, -1, op_errno, NULL); + + return 0; +} + +int +dht_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; + inode_t *inode = NULL; + xlator_t *src_subvol = 0; + xlator_t *dst_subvol = 0; + + local = frame->local; + prev = cookie; + + local->op_errno = op_errno; + + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } + + if (op_ret == -1 && !dht_inode_missing(op_errno)) { + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge(this, postbuf, &local->stbuf); + dht_iatt_merge(this, prebuf, &local->prebuf); + } + goto out; + } + + local->op_ret = op_ret; + inode = local->fd->inode; + + local->rebalance.target_op_fn = dht_fsync2; + dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata); + + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1(postbuf)) { + dht_iatt_merge(this, &local->stbuf, postbuf); + dht_iatt_merge(this, &local->prebuf, prebuf); + + dht_inode_ctx_get_mig_info(this, inode, &src_subvol, &dst_subvol); + + if 
(dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+                                    dst_subvol) ||
+            !dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+            ret = dht_rebalance_in_progress_check(this, frame);
+            if (!ret)
+                return 0;
+        } else {
+            dht_fsync2(this, dst_subvol, frame, 0);
+            return 0;
+        }
+    }
+
+out:
+    DHT_STRIP_PHASE1_FLAGS(postbuf);
+    DHT_STRIP_PHASE1_FLAGS(prebuf);
+
+    DHT_STACK_UNWIND(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+    return 0;
+}
+
+/* Second attempt of fsync, installed by dht_fsync_cbk as
+ * rebalance.target_op_fn.
+ *
+ * If we_are_not_migrating(ret) holds, the saved result (local->op_ret,
+ * rebalance.prebuf/postbuf/xdata) is unwound unchanged. Otherwise fsync is
+ * re-wound to subvol with call_cnt set to 2 so the callback does not retry
+ * again. A NULL frame, local or subvol unwinds with op_ret -1.
+ * Always returns 0.
+ */
+static int
+dht_fsync2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    dht_local_t *local = NULL;
+    int32_t op_errno = EINVAL;
+
+    if ((frame == NULL) || (frame->local == NULL))
+        goto out;
+
+    local = frame->local;
+    op_errno = local->op_errno;
+
+    if (we_are_not_migrating(ret)) {
+        /* This dht xlator is not migrating the file. Unwind and
+         * pass on the original mode bits so the higher DHT layer
+         * can handle this.
+         */
+        DHT_STACK_UNWIND(fsync, frame, local->op_ret, op_errno,
+                         &local->rebalance.prebuf, &local->rebalance.postbuf,
+                         local->rebalance.xdata);
+        return 0;
+    }
+
+    if (subvol == NULL)
+        goto out;
+
+    local->call_cnt = 2; /* This is the second attempt */
+
+    STACK_WIND_COOKIE(frame, dht_fsync_cbk, subvol, subvol, subvol->fops->fsync,
+                      local->fd, local->rebalance.flags, local->xattr_req);
+
+    return 0;
+
+out:
+    DHT_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+/* fsync fop entry point.
+ *
+ * Records the request (xdata, datasync flag) in the local so that
+ * dht_fsync2 can replay it, then winds fsync to the cached subvolume.
+ * If validation or setup fails (no local, no cached subvolume) the fop is
+ * unwound with op_ret -1. Always returns 0.
+ */
+int
+dht_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync,
+          dict_t *xdata)
+{
+    xlator_t *subvol = NULL;
+    int op_errno = -1;
+    dht_local_t *local = NULL;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+
+    local = dht_local_init(frame, NULL, fd, GF_FOP_FSYNC);
+    if (!local) {
+        op_errno = ENOMEM;
+
+        goto err;
+    }
+    if (xdata)
+        local->xattr_req = dict_ref(xdata);
+
+    local->call_cnt = 1;
+    local->rebalance.flags = datasync;
+
+    subvol = local->cached_subvol;
+    /* dht_flush, dht_readv and dht_open all refuse to wind without a
+     * cached subvolume; do the same here instead of dereferencing NULL.
+     */
+    if (!subvol) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    STACK_WIND_COOKIE(frame, dht_fsync_cbk, subvol, subvol,
subvol->fops->fsync, + local->fd, local->rebalance.flags, local->xattr_req); + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +/* TODO: for 'lk()' call, we need some other special error, may be ESTALE to + indicate that lock migration happened on the fd, so we can consider it as + phase 2 of migration */ +static int +dht_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct gf_flock *flock, dict_t *xdata) +{ + dht_local_t *local = NULL; + int ret = -1; + xlator_t *subvol = NULL; + + local = frame->local; + + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + if (local->call_cnt != 1) + goto out; + + local->rebalance.target_op_fn = dht_lk2; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + if (xdata) + local->rebalance.xdata = dict_ref(xdata); + + if (op_errno == EREMOTE) { + dht_inode_ctx_get_mig_info(this, local->fd->inode, NULL, &subvol); + if (subvol && dht_fd_open_on_dst(this, local->fd, subvol)) { + dht_lk2(this, subvol, frame, 0); + return 0; + } else { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) { + return 0; + } + } + } + +out: + dht_lk_inode_unref(frame, op_ret); + DHT_STACK_UNWIND(lk, frame, op_ret, op_errno, flock, xdata); + + return 0; +} + +static int +dht_lk2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) +{ + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + if ((frame == NULL) || (frame->local == NULL)) + goto out; + + local = frame->local; + + op_errno = local->op_errno; + + if (subvol == NULL) + goto out; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_lk_cbk, subvol, subvol->fops->lk, local->fd, + local->rebalance.lock_cmd, &local->rebalance.flock, + local->xattr_req); + + return 0; + +out: + DHT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); + return 0; +} + +int +dht_lk(call_frame_t 
*frame, xlator_t *this, fd_t *fd, int cmd, + struct gf_flock *flock, dict_t *xdata) +{ + xlator_t *lock_subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_LK); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->lock_type = flock->l_type; + lock_subvol = dht_get_lock_subvolume(this, flock, local); + if (!lock_subvol) { + gf_msg_debug(this->name, 0, "no lock subvolume for path=%p", fd); + op_errno = EINVAL; + goto err; + } + + /* + local->cached_subvol = lock_subvol; + ret = dht_check_and_open_fd_on_subvol (this, frame); + if (ret) + goto err; + */ + if (xdata) + local->xattr_req = dict_ref(xdata); + + local->rebalance.flock = *flock; + local->rebalance.lock_cmd = cmd; + + local->call_cnt = 1; + + STACK_WIND(frame, dht_lk_cbk, lock_subvol, lock_subvol->fops->lk, fd, cmd, + flock, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +static int +dht_lease_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct gf_lease *lease, dict_t *xdata) +{ + DHT_STACK_UNWIND(lease, frame, op_ret, op_errno, lease, xdata); + + return 0; +} + +int +dht_lease(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct gf_lease *lease, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + subvol = dht_subvol_get_cached(this, loc->inode); + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + /* TODO: for rebalance, we need to preserve the fop arguments */ + STACK_WIND(frame, dht_lease_cbk, subvol, subvol->fops->lease, loc, lease, + xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(lease, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +/* Symlinks are currently not migrated, so no need for any check here */ +static int +dht_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, const char *path, struct iatt *stbuf, + dict_t *xdata) +{ + dht_local_t *local = NULL; + + local = frame->local; + if (op_ret == -1) + goto err; + + if (!local) { + op_ret = -1; + op_errno = EINVAL; + } + +err: + DHT_STRIP_PHASE1_FLAGS(stbuf); + DHT_STACK_UNWIND(readlink, frame, op_ret, op_errno, path, stbuf, xdata); + + return 0; +} + +int +dht_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_READLINK); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + STACK_WIND(frame, dht_readlink_cbk, subvol, subvol->fops->readlink, loc, + size, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(readlink, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +/* Get both DHT_IATT_IN_XDATA_KEY and DHT_MODE_IN_XDATA_KEY + * Use DHT_MODE_IN_XDATA_KEY if available, else fall back to + * DHT_IATT_IN_XDATA_KEY + * This will return a dummy iatt with only the mode and type set + */ +static int +dht_read_iatt_from_xdata(dict_t *xdata, struct iatt *stbuf) +{ + int ret = -1; + int32_t mode = 0; + + ret = dict_get_int32(xdata, DHT_MODE_IN_XDATA_KEY, &mode); + + if (ret) { + ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf); + } else { + stbuf->ia_prot = ia_prot_from_st_mode(mode); + stbuf->ia_type = ia_type_from_st_mode(mode); + } + + return ret; +} + +int +dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *call_frame = NULL; + xlator_t *prev = NULL; + xlator_t *src_subvol = NULL; + xlator_t *dst_subvol = NULL; + struct iatt stbuf = { + 0, + }; + int ret = -1; + inode_t *inode = NULL; + + local = frame->local; + call_frame = cookie; + prev = call_frame->this; + + local->op_errno = op_errno; + + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1.", + prev->name); + goto out; + } + + if (local->call_cnt != 1) + goto out; + + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } + + ret = dht_read_iatt_from_xdata(xdata, &stbuf); + + if ((!op_ret) && (ret)) { + /* This is a potential problem and can cause corruption + * with sharding. + * Oh well. We tried. 
+ */ + goto out; + } + + local->op_ret = op_ret; + local->rebalance.target_op_fn = dht_common_xattrop2; + if (xdata) + local->rebalance.xdata = dict_ref(xdata); + + if (dict) + local->rebalance.dict = dict_ref(dict); + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(&stbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1(&stbuf)) { + inode = local->loc.inode ? local->loc.inode : local->fd->inode; + dht_inode_ctx_get_mig_info(this, inode, &src_subvol, &dst_subvol); + + if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol, + dst_subvol) || + !dht_fd_open_on_dst(this, local->fd, dst_subvol)) { + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } else { + dht_common_xattrop2(this, dst_subvol, frame, 0); + return 0; + } + } + +out: + if (local->fop == GF_FOP_XATTROP) { + DHT_STACK_UNWIND(xattrop, frame, op_ret, op_errno, dict, xdata); + } else { + DHT_STACK_UNWIND(fxattrop, frame, op_ret, op_errno, dict, xdata); + } + + return 0; +} + +static int +dht_common_xattrop2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, + int ret) +{ + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + if ((frame == NULL) || (frame->local == NULL)) + goto out; + + local = frame->local; + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. 
+ */ + if (local->fop == GF_FOP_XATTROP) { + DHT_STACK_UNWIND(xattrop, frame, local->op_ret, op_errno, + local->rebalance.dict, local->rebalance.xdata); + } else { + DHT_STACK_UNWIND(fxattrop, frame, local->op_ret, op_errno, + local->rebalance.dict, local->rebalance.xdata); + } + + return 0; + } + + if (subvol == NULL) + goto out; + + local->call_cnt = 2; /* This is the second attempt */ + + if (local->fop == GF_FOP_XATTROP) { + STACK_WIND(frame, dht_common_xattrop_cbk, subvol, subvol->fops->xattrop, + &local->loc, local->rebalance.flags, local->rebalance.xattr, + local->xattr_req); + } else { + STACK_WIND(frame, dht_common_xattrop_cbk, subvol, + subvol->fops->fxattrop, local->fd, local->rebalance.flags, + local->rebalance.xattr, local->xattr_req); + } + + return 0; + +out: + + /* If local is unavailable we could be unwinding the wrong + * function here */ + + if (local && (local->fop == GF_FOP_XATTROP)) { + DHT_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL); + } else { + DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL); + } + return 0; +} + +static int +dht_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + DHT_STACK_UNWIND(xattrop, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +/* Set both DHT_IATT_IN_XDATA_KEY and DHT_MODE_IN_XDATA_KEY + * Use DHT_MODE_IN_XDATA_KEY if available. 
Else fall back to + * DHT_IATT_IN_XDATA_KEY + */ +static int +dht_request_iatt_in_xdata(dict_t *xattr_req) +{ + int ret = -1; + + ret = dict_set_int8(xattr_req, DHT_MODE_IN_XDATA_KEY, 1); + ret = dict_set_int8(xattr_req, DHT_IATT_IN_XDATA_KEY, 1); + + /* At least one call succeeded */ + return ret; +} + +int +dht_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + int ret = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_XATTROP); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for gfid=%s", + uuid_utoa(loc->inode->gfid)); + op_errno = EINVAL; + goto err; + } + + /* Todo : Handle dirs as well. At the moment the only xlator above dht + * that uses xattrop is sharding and that is only for files */ + + if (IA_ISDIR(loc->inode->ia_type)) { + STACK_WIND(frame, dht_xattrop_cbk, subvol, subvol->fops->xattrop, loc, + flags, dict, xdata); + + } else { + local->xattr_req = xdata ? dict_ref(xdata) : dict_new(); + local->call_cnt = 1; + + local->rebalance.xattr = dict_ref(dict); + local->rebalance.flags = flags; + + ret = dht_request_iatt_in_xdata(local->xattr_req); + + if (ret) { + gf_msg_debug(this->name, 0, + "Failed to set dictionary key %s file=%s", + DHT_IATT_IN_XDATA_KEY, loc->path); + } + + STACK_WIND(frame, dht_common_xattrop_cbk, subvol, subvol->fops->xattrop, + loc, local->rebalance.flags, local->rebalance.xattr, + local->xattr_req); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +static int +dht_fxattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + DHT_STACK_UNWIND(fxattrop, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +int +dht_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + int ret = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + subvol = dht_subvol_get_cached(this, fd->inode); + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init(frame, NULL, fd, GF_FOP_FXATTROP); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + /* Todo : Handle dirs as well. At the moment the only xlator above dht + * that uses xattrop is sharding and that is only for files */ + + if (IA_ISDIR(fd->inode->ia_type)) { + STACK_WIND(frame, dht_fxattrop_cbk, subvol, subvol->fops->fxattrop, fd, + flags, dict, xdata); + + } else { + local->xattr_req = xdata ? dict_ref(xdata) : dict_new(); + local->call_cnt = 1; + + local->rebalance.xattr = dict_ref(dict); + local->rebalance.flags = flags; + + ret = dht_request_iatt_in_xdata(local->xattr_req); + + if (ret) { + gf_msg_debug(this->name, 0, "Failed to set dictionary key %s fd=%p", + DHT_IATT_IN_XDATA_KEY, fd); + } + + STACK_WIND(frame, dht_common_xattrop_cbk, subvol, + subvol->fops->fxattrop, fd, local->rebalance.flags, + local->rebalance.xattr, local->xattr_req); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +/* Currently no translators on top of 'distribute' will be using + * below fops, hence not implementing 'migration' related checks + */ + +static int +dht_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) + +{ + dht_lk_inode_unref(frame, op_ret); + DHT_STACK_UNWIND(inodelk, frame, op_ret, op_errno, xdata); + return 0; +} + +int32_t +dht_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + int32_t cmd, struct gf_flock *lock, dict_t *xdata) +{ + xlator_t *lock_subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_INODELK); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->lock_type = lock->l_type; + lock_subvol = dht_get_lock_subvolume(this, lock, local); + if (!lock_subvol) { + gf_msg_debug(this->name, 0, "no lock subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local->call_cnt = 1; + + STACK_WIND(frame, dht_inodelk_cbk, lock_subvol, lock_subvol->fops->inodelk, + volume, loc, cmd, lock, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(inodelk, frame, -1, op_errno, NULL); + + return 0; +} + +int +dht_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) + +{ + dht_local_t *local = NULL; + int ret = 0; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + + local = frame->local; + + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } + +out: + dht_lk_inode_unref(frame, op_ret); + DHT_STACK_UNWIND(finodelk, frame, op_ret, op_errno, xdata); + + return 0; +} + +int +dht_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *lock, dict_t *xdata) +{ + xlator_t *lock_subvol = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_INODELK); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->call_cnt = 1; + local->lock_type = lock->l_type; + + lock_subvol = dht_get_lock_subvolume(this, lock, local); + if (!lock_subvol) { + gf_msg_debug(this->name, 0, "no lock subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + /* + local->cached_subvol = lock_subvol; + ret = dht_check_and_open_fd_on_subvol (this, frame); + if (ret) + goto err; + */ + local->rebalance.flock = *lock; + local->rebalance.lock_cmd = cmd; + local->key = gf_strdup(volume); + + if (xdata) + local->xattr_req = dict_ref(xdata); + + STACK_WIND(frame, dht_finodelk_cbk, lock_subvol, + lock_subvol->fops->finodelk, volume, fd, cmd, lock, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(finodelk, frame, -1, op_errno, NULL); + + return 0; +} diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c new file mode 100644 index 00000000000..2f23ce90fbd --- /dev/null +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -0,0 +1,1404 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "dht-common.h" + +static int +dht_writev2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret); +static int +dht_truncate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret); +static int +dht_setattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret); +static int +dht_fallocate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret); +static int +dht_discard2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret); +static int +dht_zerofill2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret); + +int +dht_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; + xlator_t *subvol1 = NULL; + xlator_t *subvol2 = NULL; + + local = frame->local; + prev = cookie; + + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + /* writev fails with EBADF if dht has not yet opened the fd + * on the cached subvol. This could happen if the file was migrated + * and a lookup updated the cached subvol in the inode ctx. + * We only check once as this could be a valid bad fd error. 
+ */ + + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } + + if (op_ret == -1 && !dht_inode_missing(op_errno)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_msg_debug(this->name, 0, "subvolume %s returned -1 (%s)", prev->name, + strerror(op_errno)); + goto out; + } + + if (local->call_cnt != 1) { + /* preserve the modes of source */ + if (local->stbuf.ia_blocks) { + dht_iatt_merge(this, postbuf, &local->stbuf); + dht_iatt_merge(this, prebuf, &local->prebuf); + } + goto out; + } + + local->rebalance.target_op_fn = dht_writev2; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + /* We might need to pass the stbuf information to the higher DHT + * layer for appropriate handling. + */ + + dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata); + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1(postbuf)) { + if (!local->xattr_req) { + local->xattr_req = dict_new(); + if (!local->xattr_req) { + gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM, + "insufficient memory"); + local->op_errno = ENOMEM; + local->op_ret = -1; + goto out; + } + } + + ret = dict_set_uint32(local->xattr_req, GF_PROTECT_FROM_EXTERNAL_WRITES, + 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_DICT_SET_FAILED, 0, + "Failed to set key %s in dictionary", + GF_PROTECT_FROM_EXTERNAL_WRITES); + local->op_errno = ENOMEM; + local->op_ret = -1; + goto out; + } + + dht_iatt_merge(this, &local->stbuf, postbuf); + dht_iatt_merge(this, &local->prebuf, prebuf); + + ret = dht_inode_ctx_get_mig_info(this, local->fd->inode, &subvol1, + &subvol2); + if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) { + if (dht_fd_open_on_dst(this, local->fd, 
subvol2)) { + dht_writev2(this, subvol2, frame, 0); + return 0; + } + } + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS(postbuf); + DHT_STRIP_PHASE1_FLAGS(prebuf); + + DHT_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, xdata); + + return 0; +} + +static int +dht_writev2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) +{ + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + if ((frame == NULL) || (frame->local == NULL)) + goto out; + + local = frame->local; + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(writev, frame, local->op_ret, local->op_errno, + &local->rebalance.prebuf, &local->rebalance.postbuf, + local->rebalance.xdata); + return 0; + } + + if (subvol == NULL) + goto out; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND_COOKIE(frame, dht_writev_cbk, subvol, subvol, + subvol->fops->writev, local->fd, local->rebalance.vector, + local->rebalance.count, local->rebalance.offset, + local->rebalance.flags, local->rebalance.iobref, + local->xattr_req); + + return 0; + +out: + DHT_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +dht_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int count, off_t off, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_WRITE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + 
+ if (xdata) + local->xattr_req = dict_ref(xdata); + + local->rebalance.vector = iov_dup(vector, count); + local->rebalance.offset = off; + local->rebalance.count = count; + local->rebalance.flags = flags; + local->rebalance.iobref = iobref_ref(iobref); + local->call_cnt = 1; + + STACK_WIND_COOKIE(frame, dht_writev_cbk, subvol, subvol, + subvol->fops->writev, fd, local->rebalance.vector, + local->rebalance.count, local->rebalance.offset, + local->rebalance.flags, local->rebalance.iobref, + local->xattr_req); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +dht_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; + xlator_t *src_subvol = NULL; + xlator_t *dst_subvol = NULL; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + + local = frame->local; + prev = cookie; + + /* Needs to be checked only for ftruncate. + * ftruncate fails with EBADF/EINVAL if dht has not yet opened the fd + * on the cached subvol. This could happen if the file was migrated + * and a lookup updated the cached subvol in the inode ctx. + * We only check once as this could actually be a valid error. 
+ */ + + if ((local->fop == GF_FOP_FTRUNCATE) && + dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } + + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge(this, postbuf, &local->stbuf); + dht_iatt_merge(this, prebuf, &local->prebuf); + } + goto out; + } + + local->rebalance.target_op_fn = dht_truncate2; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + /* We might need to pass the stbuf information to the higher DHT + * layer for appropriate handling. + */ + + dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata); + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1(postbuf)) { + dht_iatt_merge(this, &local->stbuf, postbuf); + dht_iatt_merge(this, &local->prebuf, prebuf); + + inode = (local->fd) ? 
local->fd->inode : local->loc.inode; + + dht_inode_ctx_get_mig_info(this, inode, &src_subvol, &dst_subvol); + if (!dht_mig_info_is_invalid(local->cached_subvol, src_subvol, + dst_subvol)) { + if ((!local->fd) || + ((local->fd) && + dht_fd_open_on_dst(this, local->fd, dst_subvol))) { + dht_truncate2(this, dst_subvol, frame, 0); + return 0; + } + } + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS(postbuf); + DHT_STRIP_PHASE1_FLAGS(prebuf); + + DHT_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf, xdata); +err: + return 0; +} + +static int +dht_truncate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) +{ + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + if (!frame || !frame->local) + goto out; + + local = frame->local; + op_errno = local->op_errno; + + /* This dht xlator is not migrating the file */ + if (we_are_not_migrating(ret)) { + DHT_STACK_UNWIND(truncate, frame, local->op_ret, local->op_errno, + &local->rebalance.prebuf, &local->rebalance.postbuf, + local->rebalance.xdata); + return 0; + } + + if (subvol == NULL) + goto out; + + local->call_cnt = 2; /* This is the second attempt */ + + if (local->fop == GF_FOP_TRUNCATE) { + STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol, + subvol->fops->truncate, &local->loc, + local->rebalance.offset, local->xattr_req); + } else { + STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol, + subvol->fops->ftruncate, local->fd, + local->rebalance.offset, local->xattr_req); + } + + return 0; + +out: + DHT_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +dht_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + + local = 
dht_local_init(frame, loc, NULL, GF_FOP_TRUNCATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for gfid=%s", + uuid_utoa(loc->inode->gfid)); + op_errno = EINVAL; + goto err; + } + + if (xdata) + local->xattr_req = dict_ref(xdata); + + STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol, + subvol->fops->truncate, loc, offset, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +dht_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_FTRUNCATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (xdata) + local->xattr_req = dict_ref(xdata); + + STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol, + subvol->fops->ftruncate, fd, local->rebalance.offset, + local->xattr_req); + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; + xlator_t *src_subvol = NULL; + xlator_t *dst_subvol = NULL; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + + local = frame->local; + prev = cookie; + + /* fallocate fails with EBADF if dht has not yet opened the fd + * on the cached subvol. This could happen if the file was migrated + * and a lookup updated the cached subvol in the inode ctx. + * We only check once as this could actually be a valid error. + */ + + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } + + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge(this, postbuf, &local->stbuf); + dht_iatt_merge(this, prebuf, &local->prebuf); + } + goto out; + } + + local->op_ret = op_ret; + local->op_errno = op_errno; + local->rebalance.target_op_fn = dht_fallocate2; + + dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata); + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1(postbuf)) { + dht_iatt_merge(this, &local->stbuf, postbuf); + dht_iatt_merge(this, &local->prebuf, prebuf); + + 
dht_inode_ctx_get_mig_info(this, local->fd->inode, &src_subvol, + &dst_subvol); + if (!dht_mig_info_is_invalid(local->cached_subvol, src_subvol, + dst_subvol)) { + if (dht_fd_open_on_dst(this, local->fd, dst_subvol)) { + dht_fallocate2(this, dst_subvol, frame, 0); + return 0; + } + } + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS(postbuf); + DHT_STRIP_PHASE1_FLAGS(prebuf); + + DHT_STACK_UNWIND(fallocate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); +err: + return 0; +} + +static int +dht_fallocate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) +{ + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + if (!frame || !frame->local) + goto out; + + local = frame->local; + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(fallocate, frame, local->op_ret, local->op_errno, + &local->rebalance.prebuf, &local->rebalance.postbuf, + local->rebalance.xdata); + return 0; + } + + if (subvol == NULL) + goto out; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND_COOKIE(frame, dht_fallocate_cbk, subvol, subvol, + subvol->fops->fallocate, local->fd, + local->rebalance.flags, local->rebalance.offset, + local->rebalance.size, local->xattr_req); + + return 0; + +out: + DHT_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_FALLOCATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.flags = 
mode; + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (xdata) + local->xattr_req = dict_ref(xdata); + + STACK_WIND_COOKIE(frame, dht_fallocate_cbk, subvol, subvol, + subvol->fops->fallocate, fd, local->rebalance.flags, + local->rebalance.offset, local->rebalance.size, + local->xattr_req); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; + xlator_t *src_subvol = NULL; + xlator_t *dst_subvol = NULL; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + + local = frame->local; + prev = cookie; + + /* discard fails with EBADF if dht has not yet opened the fd + * on the cached subvol. This could happen if the file was migrated + * and a lookup updated the cached subvol in the inode ctx. + * We only check once as this could actually be a valid error. 
+ */ + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } + + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge(this, postbuf, &local->stbuf); + dht_iatt_merge(this, prebuf, &local->prebuf); + } + goto out; + } + + local->rebalance.target_op_fn = dht_discard2; + local->op_ret = op_ret; + local->op_errno = op_errno; + + dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata); + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1(postbuf)) { + dht_iatt_merge(this, &local->stbuf, postbuf); + dht_iatt_merge(this, &local->prebuf, prebuf); + + dht_inode_ctx_get_mig_info(this, local->fd->inode, &src_subvol, + &dst_subvol); + if (!dht_mig_info_is_invalid(local->cached_subvol, src_subvol, + dst_subvol)) { + if (dht_fd_open_on_dst(this, local->fd, dst_subvol)) { + dht_discard2(this, dst_subvol, frame, 0); + return 0; + } + } + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS(postbuf); + DHT_STRIP_PHASE1_FLAGS(prebuf); + + DHT_STACK_UNWIND(discard, frame, op_ret, op_errno, prebuf, postbuf, xdata); +err: + return 0; +} + +static int +dht_discard2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) +{ + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + if (!frame || !frame->local) + goto out; + + local = frame->local; + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. 
Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(discard, frame, local->op_ret, local->op_errno, + &local->rebalance.prebuf, &local->rebalance.postbuf, + local->rebalance.xdata); + return 0; + } + + if (subvol == NULL) + goto out; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND_COOKIE(frame, dht_discard_cbk, subvol, subvol, + subvol->fops->discard, local->fd, local->rebalance.offset, + local->rebalance.size, local->xattr_req); + + return 0; + +out: + DHT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_DISCARD); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (xdata) + local->xattr_req = dict_ref(xdata); + + STACK_WIND_COOKIE(frame, dht_discard_cbk, subvol, subvol, + subvol->fops->discard, fd, local->rebalance.offset, + local->rebalance.size, local->xattr_req); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; + xlator_t *subvol1 = NULL, *subvol2 = NULL; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + + local = frame->local; + prev = cookie; + + /* zerofill fails with EBADF if dht has not yet opened the fd + * on the cached subvol. This could happen if the file was migrated + * and a lookup updated the cached subvol in the inode ctx. + * We only check once as this could actually be a valid error. + */ + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } + + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge(this, postbuf, &local->stbuf); + dht_iatt_merge(this, prebuf, &local->prebuf); + } + goto out; + } + + local->rebalance.target_op_fn = dht_zerofill2; + local->op_ret = op_ret; + local->op_errno = op_errno; + + dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata); + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1(postbuf)) { + dht_iatt_merge(this, &local->stbuf, postbuf); + dht_iatt_merge(this, &local->prebuf, prebuf); + + ret = dht_inode_ctx_get_mig_info(this, 
local->fd->inode, &subvol1, + &subvol2); + if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) { + if (dht_fd_open_on_dst(this, local->fd, subvol2)) { + dht_zerofill2(this, subvol2, frame, 0); + return 0; + } + } + + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS(postbuf); + DHT_STRIP_PHASE1_FLAGS(prebuf); + + DHT_STACK_UNWIND(zerofill, frame, op_ret, op_errno, prebuf, postbuf, xdata); +err: + return 0; +} + +static int +dht_zerofill2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) +{ + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + if (!frame || !frame->local) + goto out; + + local = frame->local; + + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(zerofill, frame, local->op_ret, local->op_errno, + &local->rebalance.prebuf, &local->rebalance.postbuf, + local->rebalance.xdata); + + return 0; + } + + if (subvol == NULL) + goto out; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND_COOKIE(frame, dht_zerofill_cbk, subvol, subvol, + subvol->fops->zerofill, local->fd, + local->rebalance.offset, local->rebalance.size, + local->xattr_req); + + return 0; + +out: + + DHT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_ZEROFILL); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; 
+ if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (xdata) + local->xattr_req = dict_ref(xdata); + + STACK_WIND_COOKIE(frame, dht_zerofill_cbk, subvol, subvol, + subvol->fops->zerofill, fd, local->rebalance.offset, + local->rebalance.size, local->xattr_req); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +/* handle cases of migration here for 'setattr()' calls */ +int +dht_file_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; + + local = frame->local; + prev = cookie; + + local->op_errno = op_errno; + + if ((local->fop == GF_FOP_FSETATTR) && + dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } + + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto out; + } + + if (local->call_cnt != 1) + goto out; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + local->rebalance.target_op_fn = dht_setattr2; + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) { + dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata); + + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } + + /* At the end of the migration process, whatever 'attr' we + have on source file will be migrated to destination file + in one shot, hence we don't need to check for in progress + state here (ie, PHASE1) */ +out: + DHT_STRIP_PHASE1_FLAGS(postbuf); + DHT_STRIP_PHASE1_FLAGS(prebuf); + + DHT_STACK_UNWIND(setattr, frame, op_ret, op_errno, prebuf, postbuf, xdata); + + return 0; +} 
+ +static int +dht_setattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) +{ + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + if (!frame || !frame->local) + goto out; + + local = frame->local; + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno, + &local->rebalance.prebuf, &local->rebalance.postbuf, + local->rebalance.xdata); + return 0; + } + + if (subvol == NULL) + goto out; + + local->call_cnt = 2; /* This is the second attempt */ + + if (local->fop == GF_FOP_SETATTR) { + STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol, + subvol->fops->setattr, &local->loc, + &local->rebalance.stbuf, local->rebalance.flags, + local->xattr_req); + } else { + STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol, + subvol->fops->fsetattr, local->fd, + &local->rebalance.stbuf, local->rebalance.flags, + local->xattr_req); + } + + return 0; + +out: + DHT_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +/* Keep the existing code same for all the cases other than regular file */ +int +dht_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *statpre, struct iatt *statpost, + dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + + local = frame->local; + prev = cookie; + + LOCK(&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + UNLOCK(&frame->lock); + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto post_unlock; + } + + dht_iatt_merge(this, &local->prebuf, statpre); + dht_iatt_merge(this, &local->stbuf, statpost); + + local->op_ret = 0; + local->op_errno = 0; + } + UNLOCK(&frame->lock); +post_unlock: + this_call_cnt = 
dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + if (local->op_ret == 0) + dht_inode_ctx_time_set(local->loc.inode, this, &local->stbuf); + DHT_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->stbuf, xdata); + } + + return 0; +} + +/* Keep the existing code same for all the cases other than regular file */ +int +dht_non_mds_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + gf_msg(this->name, op_errno, 0, 0, "subvolume %s returned -1", + prev->name); + goto post_unlock; + } + + LOCK(&frame->lock); + { + dht_iatt_merge(this, &local->prebuf, statpre); + dht_iatt_merge(this, &local->stbuf, statpost); + + local->op_ret = 0; + local->op_errno = 0; + } + UNLOCK(&frame->lock); +post_unlock: + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + dht_inode_ctx_time_set(local->loc.inode, this, &local->stbuf); + DHT_STACK_UNWIND(setattr, frame, 0, 0, &local->prebuf, &local->stbuf, + xdata); + } + + return 0; +} + +int +dht_mds_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) + +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *prev = NULL; + xlator_t *mds_subvol = NULL; + struct iatt loc_stbuf = { + 0, + }; + int i = 0; + + local = frame->local; + prev = cookie; + conf = this->private; + mds_subvol = local->mds_subvol; + + if (op_ret == -1) { + local->op_ret = op_ret; + local->op_errno = op_errno; + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto out; + } + + local->op_ret = 0; + loc_stbuf = local->stbuf; + dht_iatt_merge(this, &local->prebuf, statpre); + dht_iatt_merge(this, &local->stbuf, 
statpost); + + local->call_cnt = conf->subvolume_cnt - 1; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (mds_subvol == conf->subvolumes[i]) + continue; + STACK_WIND_COOKIE(frame, dht_non_mds_setattr_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->setattr, &local->loc, + &loc_stbuf, local->valid, local->xattr_req); + } + + return 0; +out: + DHT_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->stbuf, xdata); + + return 0; +} + +int +dht_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + xlator_t *subvol = NULL; + xlator_t *mds_subvol = NULL; + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + int ret = -1; + int call_cnt = 0; + dht_conf_t *conf = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + + conf = this->private; + local = dht_local_init(frame, loc, NULL, GF_FOP_SETATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane(layout)) { + gf_msg_debug(this->name, 0, "layout is not sane for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + if (xdata) + local->xattr_req = dict_ref(xdata); + + if (IA_ISREG(loc->inode->ia_type)) { + /* in the regular file _cbk(), we need to check for + migration possibilities */ + local->rebalance.stbuf = *stbuf; + local->rebalance.flags = valid; + local->call_cnt = 1; + subvol = local->cached_subvol; + + STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol, + subvol->fops->setattr, loc, stbuf, valid, xdata); + + return 0; + } + + local->call_cnt = call_cnt = layout->cnt; + + if (IA_ISDIR(loc->inode->ia_type) && 
!__is_root_gfid(loc->inode->gfid) && + call_cnt != 1) { + ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol); + if (ret || !mds_subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Failed to get mds subvol for path %s", local->loc.path); + op_errno = EINVAL; + goto err; + } + + local->mds_subvol = mds_subvol; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == mds_subvol) { + if (!conf->subvolume_status[i]) { + gf_msg(this->name, GF_LOG_WARNING, layout->list[i].err, + DHT_MSG_HASHED_SUBVOL_DOWN, + "MDS subvol is down for path " + " %s Unable to set attr ", + local->loc.path); + op_errno = ENOTCONN; + goto err; + } + } + } + local->valid = valid; + local->stbuf = *stbuf; + + STACK_WIND_COOKIE(frame, dht_mds_setattr_cbk, local->mds_subvol, + local->mds_subvol, local->mds_subvol->fops->setattr, + loc, stbuf, valid, xdata); + return 0; + } else { + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_setattr_cbk, layout->list[i].xlator, + layout->list[i].xlator, + layout->list[i].xlator->fops->setattr, loc, stbuf, + valid, xdata); + } + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +dht_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + xlator_t *subvol = NULL; + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + int call_cnt = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_FSETATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_msg_debug(this->name, 0, "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane(layout)) { + gf_msg_debug(this->name, 0, "layout is not sane for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + if (xdata) + local->xattr_req = dict_ref(xdata); + + if (IA_ISREG(fd->inode->ia_type)) { + /* in the regular file _cbk(), we need to check for + migration possibilities */ + local->rebalance.stbuf = *stbuf; + local->rebalance.flags = valid; + local->call_cnt = 1; + subvol = local->cached_subvol; + + STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol, + subvol->fops->fsetattr, fd, &local->rebalance.stbuf, + local->rebalance.flags, local->xattr_req); + return 0; + } + + local->call_cnt = call_cnt = layout->cnt; + + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_setattr_cbk, layout->list[i].xlator, + layout->list[i].xlator, + layout->list[i].xlator->fops->fsetattr, fd, stbuf, + valid, xdata); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index 16767adb9fa..fda904c92c9 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -1,721 +1,808 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - - -#include "glusterfs.h" -#include "xlator.h" #include "dht-common.h" -#include "byte-order.h" +#include <glusterfs/byte-order.h> +#include "unittest/unittest.h" -#define layout_base_size (sizeof (dht_layout_t)) +#define layout_base_size (sizeof(dht_layout_t)) -#define layout_entry_size (sizeof ((dht_layout_t *)NULL)->list[0]) +#define layout_entry_size (sizeof((dht_layout_t *)NULL)->list[0]) #define layout_size(cnt) (layout_base_size + (cnt * layout_entry_size)) - dht_layout_t * -dht_layout_new (xlator_t *this, int cnt) +dht_layout_new(xlator_t *this, int cnt) { - dht_layout_t *layout = NULL; - dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + REQUIRE(NULL != this); + REQUIRE(cnt >= 0); - conf = this->private; + conf = this->private; - layout = GF_CALLOC (1, layout_size (cnt), - gf_dht_mt_dht_layout_t); - if (!layout) { - goto out; - } + layout = GF_CALLOC(1, layout_size(cnt), gf_dht_mt_dht_layout_t); + if (!layout) { + goto out; + } + + layout->type = DHT_HASH_TYPE_DM; + layout->cnt = cnt; + + if (conf) { + layout->spread_cnt = conf->dir_spread_cnt; + layout->gen = conf->gen; + } - layout->type = DHT_HASH_TYPE_DM; - layout->cnt = cnt; - if (conf) - layout->gen = conf->gen; + GF_ATOMIC_INIT(layout->ref, 1); - layout->ref = 1; + ENSURE(NULL != layout); + ENSURE(layout->type == DHT_HASH_TYPE_DM); + ENSURE(layout->cnt == cnt); + ENSURE(GF_ATOMIC_GET(layout->ref) == 1); out: - return layout; + return layout; } - dht_layout_t * -dht_layout_get (xlator_t *this, inode_t *inode) +dht_layout_get(xlator_t *this, inode_t *inode) { - dht_conf_t *conf = NULL; - uint64_t layout_int = 0; - dht_layout_t *layout = NULL; - int ret = -1; - - conf = this->private; - if (!conf) - goto out; - - LOCK (&conf->layout_lock); - { - ret = inode_ctx_get (inode, this, &layout_int); - if (ret == 0) { - layout = (dht_layout_t *) (unsigned long) layout_int; - layout->ref++; - } - } - UNLOCK 
(&conf->layout_lock); - -out: - return layout; + dht_layout_t *layout = NULL; + int ret = 0; + + ret = dht_inode_ctx_layout_get(inode, this, &layout); + if ((!ret) && layout) { + GF_ATOMIC_INC(layout->ref); + } + return layout; } - int -dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout) +dht_layout_set(xlator_t *this, inode_t *inode, dht_layout_t *layout) { - dht_conf_t *conf = NULL; - int oldret = -1; - int ret = 0; - dht_layout_t *old_layout; - uint64_t old_layout_int; - - conf = this->private; - if (!conf) - goto out; - - LOCK (&conf->layout_lock); - { - oldret = inode_ctx_get (inode, this, &old_layout_int); - - layout->ref++; - ret = inode_ctx_put (inode, this, (uint64_t) (unsigned long) - layout); - } - UNLOCK (&conf->layout_lock); - - if (oldret == 0) { - old_layout = (dht_layout_t *) (unsigned long) old_layout_int; - dht_layout_unref (this, old_layout); - } + dht_conf_t *conf = NULL; + int oldret = -1; + int ret = -1; + dht_layout_t *old_layout; + + conf = this->private; + if (!conf || !layout) + goto out; + + LOCK(&conf->layout_lock); + { + oldret = dht_inode_ctx_layout_get(inode, this, &old_layout); + if (layout) + GF_ATOMIC_INC(layout->ref); + ret = dht_inode_ctx_layout_set(inode, this, layout); + } + UNLOCK(&conf->layout_lock); + + if (!oldret) { + dht_layout_unref(this, old_layout); + } + if (ret) + GF_ATOMIC_DEC(layout->ref); out: - return ret; + return ret; } - void -dht_layout_unref (xlator_t *this, dht_layout_t *layout) +dht_layout_unref(xlator_t *this, dht_layout_t *layout) { - dht_conf_t *conf = NULL; - int ref = 0; + int ref = 0; - if (layout->preset || !this->private) - return; + if (!layout || layout->preset || !this->private) + return; - conf = this->private; + ref = GF_ATOMIC_DEC(layout->ref); - LOCK (&conf->layout_lock); - { - ref = --layout->ref; - } - UNLOCK (&conf->layout_lock); - - if (!ref) - GF_FREE (layout); + if (!ref) + GF_FREE(layout); } - dht_layout_t * -dht_layout_ref (xlator_t *this, dht_layout_t *layout) 
+dht_layout_ref(xlator_t *this, dht_layout_t *layout) { - dht_conf_t *conf = NULL; + if (layout->preset || !this->private) + return layout; - if (layout->preset || !this->private) - return layout; + GF_ATOMIC_INC(layout->ref); - conf = this->private; - LOCK (&conf->layout_lock); - { - layout->ref++; - } - UNLOCK (&conf->layout_lock); - - return layout; + return layout; } - xlator_t * -dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name) +dht_layout_search(xlator_t *this, dht_layout_t *layout, const char *name) { - uint32_t hash = 0; - xlator_t *subvol = NULL; - int i = 0; - int ret = 0; - - - ret = dht_hash_compute (layout->type, name, &hash); - if (ret != 0) { - gf_log (this->name, GF_LOG_INFO, - "hash computation failed for type=%d name=%s", - layout->type, name); - goto out; - } - - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].start <= hash - && layout->list[i].stop >= hash) { - subvol = layout->list[i].xlator; - break; - } - } - - if (!subvol) { - gf_log (this->name, GF_LOG_INFO, - "no subvolume for hash (value) = %u", hash); - } + uint32_t hash = 0; + xlator_t *subvol = NULL; + int i = 0; + int ret = 0; + + ret = dht_hash_compute(this, layout->type, name, &hash); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_COMPUTE_HASH_FAILED, + "type=%d", layout->type, "name=%s", name, NULL); + goto out; + } + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].start <= hash && layout->list[i].stop >= hash) { + subvol = layout->list[i].xlator; + break; + } + } + + if (!subvol) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "hash-value=0x%x", hash, NULL); + } out: - return subvol; + return subvol; } - dht_layout_t * -dht_layout_for_subvol (xlator_t *this, xlator_t *subvol) +dht_layout_for_subvol(xlator_t *this, xlator_t *subvol) { - dht_conf_t *conf = NULL; - dht_layout_t *layout = NULL; - int i = 0; - - conf = this->private; - if (!conf) - goto out; - - for (i = 0; i < 
conf->subvolume_cnt; i++) { - if (conf->subvolumes[i] == subvol) { - layout = conf->file_layouts[i]; - break; - } + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + int i = 0; + + conf = this->private; + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == subvol) { + layout = conf->file_layouts[i]; + break; } + } out: - return layout; + return layout; } - int -dht_layouts_init (xlator_t *this, dht_conf_t *conf) +dht_layouts_init(xlator_t *this, dht_conf_t *conf) { - dht_layout_t *layout = NULL; - int i = 0; - int ret = -1; - - if (!conf) - goto out; - - conf->file_layouts = GF_CALLOC (conf->subvolume_cnt, - sizeof (dht_layout_t *), - gf_dht_mt_dht_layout_t); - if (!conf->file_layouts) { - goto out; - } + dht_layout_t *layout = NULL; + int i = 0; + int ret = -1; - for (i = 0; i < conf->subvolume_cnt; i++) { - layout = dht_layout_new (this, 1); + if (!conf) + goto out; - if (!layout) { - goto out; - } + conf->file_layouts = GF_CALLOC(conf->subvolume_cnt, sizeof(dht_layout_t *), + gf_dht_mt_dht_layout_t); + if (!conf->file_layouts) { + goto out; + } - layout->preset = 1; + for (i = 0; i < conf->subvolume_cnt; i++) { + layout = dht_layout_new(this, 1); - layout->list[0].xlator = conf->subvolumes[i]; - - conf->file_layouts[i] = layout; + if (!layout) { + goto out; } - ret = 0; + layout->preset = 1; + + layout->list[0].xlator = conf->subvolumes[i]; + + conf->file_layouts[i] = layout; + } + + ret = 0; out: - return ret; + return ret; } - int -dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, - int pos, int32_t **disk_layout_p) +dht_disk_layout_extract(xlator_t *this, dht_layout_t *layout, int pos, + int32_t **disk_layout_p) { - int ret = -1; - int32_t *disk_layout = NULL; + int ret = -1; + int32_t *disk_layout = NULL; - disk_layout = GF_CALLOC (5, sizeof (int), - gf_dht_mt_int32_t); - if (!disk_layout) { - goto out; - } + disk_layout = GF_CALLOC(5, sizeof(int), gf_dht_mt_int32_t); + if (!disk_layout) { 
+ goto out; + } - disk_layout[0] = hton32 (1); - disk_layout[1] = hton32 (layout->type); - disk_layout[2] = hton32 (layout->list[pos].start); - disk_layout[3] = hton32 (layout->list[pos].stop); + disk_layout[0] = hton32(layout->list[pos].commit_hash); + disk_layout[1] = hton32(layout->type); + disk_layout[2] = hton32(layout->list[pos].start); + disk_layout[3] = hton32(layout->list[pos].stop); - if (disk_layout_p) - *disk_layout_p = disk_layout; - ret = 0; + if (disk_layout_p) + *disk_layout_p = disk_layout; + else + GF_FREE(disk_layout); + + ret = 0; out: - return ret; + return ret; } - int -dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, - int pos, void *disk_layout_raw) +dht_disk_layout_extract_for_subvol(xlator_t *this, dht_layout_t *layout, + xlator_t *subvol, int32_t **disk_layout_p) { - int cnt = 0; - int type = 0; - int start_off = 0; - int stop_off = 0; - int disk_layout[4]; - - /* TODO: assert disk_layout_ptr is of required length */ - - memcpy (disk_layout, disk_layout_raw, sizeof (disk_layout)); - - cnt = ntoh32 (disk_layout[0]); - if (cnt != 1) { - gf_log (this->name, GF_LOG_INFO, - "disk layout has invalid count %d", cnt); - return -1; - } - - /* TODO: assert type is compatible */ - type = ntoh32 (disk_layout[1]); - start_off = ntoh32 (disk_layout[2]); - stop_off = ntoh32 (disk_layout[3]); + int i = 0; - layout->list[pos].start = start_off; - layout->list[pos].stop = stop_off; + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) + break; + } - gf_log (this->name, GF_LOG_TRACE, - "merged to layout: %u - %u (type %d) from %s", - start_off, stop_off, type, - layout->list[pos].xlator->name); + if (i == layout->cnt) + return -1; - return 0; + return dht_disk_layout_extract(this, layout, i, disk_layout_p); } +static int +dht_disk_layout_merge(xlator_t *this, dht_layout_t *layout, int pos, + void *disk_layout_raw, int disk_layout_len) +{ + int type = 0; + int start_off = 0; + int stop_off = 0; + int commit_hash = 0; + int 
disk_layout[4]; + + if (!disk_layout_raw) { + gf_smsg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_LAYOUT_MERGE_FAILED, + NULL); + return -1; + } + + GF_ASSERT(disk_layout_len == sizeof(disk_layout)); + + memcpy(disk_layout, disk_layout_raw, disk_layout_len); + + type = ntoh32(disk_layout[1]); + switch (type) { + case DHT_HASH_TYPE_DM_USER: + gf_msg_debug(this->name, 0, "found user-set layout"); + layout->type = type; + /* Fall through. */ + case DHT_HASH_TYPE_DM: + break; + default: + gf_smsg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_INVALID_DISK_LAYOUT, + "layout=%d", disk_layout[1], NULL); + return -1; + } + + commit_hash = ntoh32(disk_layout[0]); + start_off = ntoh32(disk_layout[2]); + stop_off = ntoh32(disk_layout[3]); + + layout->list[pos].commit_hash = commit_hash; + layout->list[pos].start = start_off; + layout->list[pos].stop = stop_off; + + gf_msg_trace(this->name, 0, + "merged to layout: 0x%x - 0x%x (hash 0x%x, type %d) from %s", + start_off, stop_off, commit_hash, type, + layout->list[pos].xlator->name); + + return 0; +} int -dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, - int op_ret, int op_errno, dict_t *xattr) +dht_layout_merge(xlator_t *this, dht_layout_t *layout, xlator_t *subvol, + int op_ret, int op_errno, dict_t *xattr) { - int i = 0; - int ret = -1; - int err = -1; - void *disk_layout_raw = NULL; + int i = 0; + int ret = -1; + int err = -1; + void *disk_layout_raw = NULL; + int disk_layout_len = 0; + dht_conf_t *conf = this->private; + if (op_ret != 0) { + err = op_errno; + } - if (op_ret != 0) { - err = op_errno; - } - - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].xlator == NULL) { - layout->list[i].err = err; - layout->list[i].xlator = subvol; - break; - } - } + if (!layout) + goto out; - if (op_ret != 0) { - ret = 0; - goto out; + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == NULL) { + layout->list[i].err = err; + layout->list[i].xlator = subvol; + break; } + } - if (xattr) { - /* 
during lookup and not mkdir */ - ret = dict_get_ptr (xattr, "trusted.glusterfs.dht", - &disk_layout_raw); - } + if (op_ret != 0) { + ret = 0; + goto out; + } - if (ret != 0) { - layout->list[i].err = -1; - gf_log (this->name, GF_LOG_TRACE, - "missing disk layout on %s. err = %d", - subvol->name, err); - ret = 0; - goto out; - } + if (xattr) { + /* during lookup and not mkdir */ + ret = dict_get_ptr_and_len(xattr, conf->xattr_name, &disk_layout_raw, + &disk_layout_len); + } - ret = dht_disk_layout_merge (this, layout, i, disk_layout_raw); - if (ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "layout merge from subvolume %s failed", - subvol->name); - goto out; - } + if (ret != 0) { layout->list[i].err = 0; + gf_msg_trace(this->name, 0, "Missing disk layout on %s. err = %d", + subvol->name, err); + ret = 0; + goto out; + } + + ret = dht_disk_layout_merge(this, layout, i, disk_layout_raw, + disk_layout_len); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED, + "subvolume=%s", subvol->name, NULL); + goto out; + } + + if (layout->commit_hash == 0) { + layout->commit_hash = layout->list[i].commit_hash; + } else if (layout->commit_hash != layout->list[i].commit_hash) { + layout->commit_hash = DHT_LAYOUT_HASH_INVALID; + } + + layout->list[i].err = 0; out: - return ret; + return ret; } +void +dht_layout_entry_swap(dht_layout_t *layout, int i, int j) +{ + uint32_t start_swap = 0; + uint32_t stop_swap = 0; + uint32_t commit_hash_swap = 0; + xlator_t *xlator_swap = 0; + int err_swap = 0; + + start_swap = layout->list[i].start; + stop_swap = layout->list[i].stop; + xlator_swap = layout->list[i].xlator; + err_swap = layout->list[i].err; + commit_hash_swap = layout->list[i].commit_hash; + + layout->list[i].start = layout->list[j].start; + layout->list[i].stop = layout->list[j].stop; + layout->list[i].xlator = layout->list[j].xlator; + layout->list[i].err = layout->list[j].err; + layout->list[i].commit_hash = layout->list[j].commit_hash; + + 
layout->list[j].start = start_swap; + layout->list[j].stop = stop_swap; + layout->list[j].xlator = xlator_swap; + layout->list[j].err = err_swap; + layout->list[j].commit_hash = commit_hash_swap; +} void -dht_layout_entry_swap (dht_layout_t *layout, int i, int j) +dht_layout_range_swap(dht_layout_t *layout, int i, int j) { - uint32_t start_swap = 0; - uint32_t stop_swap = 0; - xlator_t *xlator_swap = 0; - int err_swap = 0; - - start_swap = layout->list[i].start; - stop_swap = layout->list[i].stop; - xlator_swap = layout->list[i].xlator; - err_swap = layout->list[i].err; - - layout->list[i].start = layout->list[j].start; - layout->list[i].stop = layout->list[j].stop; - layout->list[i].xlator = layout->list[j].xlator; - layout->list[i].err = layout->list[j].err; - - layout->list[j].start = start_swap; - layout->list[j].stop = stop_swap; - layout->list[j].xlator = xlator_swap; - layout->list[j].err = err_swap; + uint32_t start_swap = 0; + uint32_t stop_swap = 0; + + start_swap = layout->list[i].start; + stop_swap = layout->list[i].stop; + + layout->list[i].start = layout->list[j].start; + layout->list[i].stop = layout->list[j].stop; + + layout->list[j].start = start_swap; + layout->list[j].stop = stop_swap; +} +static int64_t +dht_layout_entry_cmp_volname(dht_layout_t *layout, int i, int j) +{ + return (strcmp(layout->list[i].xlator->name, layout->list[j].xlator->name)); } -int64_t -dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j) +gf_boolean_t +dht_is_subvol_in_layout(dht_layout_t *layout, xlator_t *xlator) { - return (strcmp (layout->list[i].xlator->name, - layout->list[j].xlator->name)); + int i = 0; + + for (i = 0; i < layout->cnt; i++) { + /* Check if xlator is already part of layout, and layout is + * non-zero. 
*/ + if (!strcmp(layout->list[i].xlator->name, xlator->name)) { + if (layout->list[i].start != layout->list[i].stop) + return _gf_true; + break; + } + } + return _gf_false; } -int64_t -dht_layout_entry_cmp (dht_layout_t *layout, int i, int j) +static int64_t +dht_layout_entry_cmp(dht_layout_t *layout, int i, int j) { - int64_t diff = 0; + int64_t diff = 0; - if (layout->list[i].err || layout->list[j].err) - diff = layout->list[i].err - layout->list[j].err; - else - diff = (int64_t) layout->list[i].start - - (int64_t) layout->list[j].start; + /* swap zero'ed out layouts to front, if needed */ + if (!layout->list[j].start && !layout->list[j].stop) { + diff = (int64_t)layout->list[i].stop - (int64_t)layout->list[j].stop; + goto out; + } + diff = (int64_t)layout->list[i].start - (int64_t)layout->list[j].start; - return diff; +out: + return diff; } - int -dht_layout_sort (dht_layout_t *layout) +dht_layout_sort(dht_layout_t *layout) { - int i = 0; - int j = 0; - int64_t ret = 0; + int i = 0; + int j = 0; + int64_t ret = 0; - /* TODO: O(n^2) -- bad bad */ + /* TODO: O(n^2) -- bad bad */ - for (i = 0; i < layout->cnt - 1; i++) { - for (j = i + 1; j < layout->cnt; j++) { - ret = dht_layout_entry_cmp (layout, i, j); - if (ret > 0) - dht_layout_entry_swap (layout, i, j); - } + for (i = 0; i < layout->cnt - 1; i++) { + for (j = i + 1; j < layout->cnt; j++) { + ret = dht_layout_entry_cmp(layout, i, j); + if (ret > 0) + dht_layout_entry_swap(layout, i, j); } + } - return 0; + return 0; } -int -dht_layout_sort_volname (dht_layout_t *layout) +void +dht_layout_sort_volname(dht_layout_t *layout) { - int i = 0; - int j = 0; - int64_t ret = 0; + int i = 0; + int j = 0; + int64_t ret = 0; - /* TODO: O(n^2) -- bad bad */ + /* TODO: O(n^2) -- bad bad */ - for (i = 0; i < layout->cnt - 1; i++) { - for (j = i + 1; j < layout->cnt; j++) { - ret = dht_layout_entry_cmp_volname (layout, i, j); - if (ret > 0) - dht_layout_entry_swap (layout, i, j); - } + for (i = 0; i < layout->cnt - 1; i++) { 
+ for (j = i + 1; j < layout->cnt; j++) { + ret = dht_layout_entry_cmp_volname(layout, i, j); + if (ret > 0) + dht_layout_entry_swap(layout, i, j); } - - return 0; + } } - -int -dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, - uint32_t *holes_p, uint32_t *overlaps_p, - uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p) +void +dht_layout_anomalies(xlator_t *this, loc_t *loc, dht_layout_t *layout, + uint32_t *holes_p, uint32_t *overlaps_p, + uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p, + uint32_t *no_space_p) { - uint32_t overlaps = 0; - uint32_t missing = 0; - uint32_t down = 0; - uint32_t misc = 0; - uint32_t hole_cnt = 0; - uint32_t overlap_cnt = 0; - int i = 0; - int ret = 0; - uint32_t prev_stop = 0; - uint32_t last_stop = 0; - char is_virgin = 1; - - /* TODO: explain WTF is happening */ - - last_stop = layout->list[0].start - 1; - prev_stop = last_stop; - - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err) { - switch (layout->list[i].err) { - case -1: - case ENOENT: - missing++; - break; - case ENOTCONN: - down++; - break; - case ENOSPC: - down++; - break; - default: - misc++; - } - continue; + uint32_t overlaps = 0; + uint32_t missing = 0; + uint32_t down = 0; + uint32_t misc = 0; + uint32_t hole_cnt = 0; + uint32_t overlap_cnt = 0; + int i = 0; + uint32_t prev_stop = 0; + uint32_t last_stop = 0; + char is_virgin = 1; + uint32_t no_space = 0; + + /* This function scans through the layout spread of a directory to + check if there are any anomalies. Prior to calling this function + the layout entries should be sorted in the ascending order. + + If the layout entry has err != 0 + then increment the corresponding anomaly. 
+ else + if (start of the current layout entry > stop + 1 of previous + non erroneous layout entry) + then it indicates a hole in the layout + if (start of the current layout entry < stop + 1 of previous + non erroneous layout entry) + then it indicates an overlap in the layout + */ + last_stop = layout->list[0].start - 1; + prev_stop = last_stop; + + for (i = 0; i < layout->cnt; i++) { + switch (layout->list[i].err) { + case -1: + case ENOENT: + case ESTALE: + missing++; + continue; + case ENOTCONN: + down++; + continue; + case ENOSPC: + no_space++; + continue; + case 0: + /* if err == 0 and start == stop, then it is a non misc++; + * participating subvolume(spread-cnt). Then, do not + * check for anomalies. If start != stop, then treat it + * as misc err */ + if (layout->list[i].start == layout->list[i].stop) { + continue; } + break; + default: + misc++; + continue; + } - is_virgin = 0; + is_virgin = 0; - if ((prev_stop + 1) < layout->list[i].start) { - hole_cnt++; - } + if ((prev_stop + 1) < layout->list[i].start) { + hole_cnt++; + } - if ((prev_stop + 1) > layout->list[i].start) { - overlap_cnt++; - overlaps += ((prev_stop + 1) - layout->list[i].start); - } - prev_stop = layout->list[i].stop; + if ((prev_stop + 1) > layout->list[i].start) { + overlap_cnt++; + overlaps += ((prev_stop + 1) - layout->list[i].start); } + prev_stop = layout->list[i].stop; + } - if ((last_stop - prev_stop) || is_virgin) - hole_cnt++; + if ((last_stop - prev_stop) || is_virgin) + hole_cnt++; - if (holes_p) - *holes_p = hole_cnt; + if (holes_p) + *holes_p = hole_cnt; - if (overlaps_p) - *overlaps_p = overlap_cnt; + if (overlaps_p) + *overlaps_p = overlap_cnt; - if (missing_p) - *missing_p = missing; + if (missing_p) + *missing_p = missing; - if (down_p) - *down_p = down; + if (down_p) + *down_p = down; - if (misc_p) - *misc_p = misc; + if (misc_p) + *misc_p = misc; - return ret; + if (no_space_p) + *no_space_p = no_space; } - int -dht_layout_normalize (xlator_t *this, loc_t *loc, 
dht_layout_t *layout) +dht_layout_missing_dirs(dht_layout_t *layout) { - int ret = 0; - int i = 0; - uint32_t holes = 0; - uint32_t overlaps = 0; - uint32_t missing = 0; - uint32_t down = 0; - uint32_t misc = 0; - - - ret = dht_layout_sort (layout); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "sort failed?! how the ...."); - goto out; - } + int i = 0, missing = 0; - ret = dht_layout_anomalies (this, loc, layout, - &holes, &overlaps, - &missing, &down, &misc); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "error while finding anomalies in %s -- not good news", - loc->path); - goto out; - } + if (layout == NULL) + goto out; - if (holes || overlaps) { - if (missing == layout->cnt) { - gf_log (this->name, GF_LOG_DEBUG, - "directory %s looked up first time", - loc->path); - } else { - gf_log (this->name, GF_LOG_INFO, - "found anomalies in %s. holes=%d overlaps=%d", - loc->path, holes, overlaps); - } - ret = 1; - } - - for (i = 0; i < layout->cnt; i++) { - /* TODO During DHT selfheal rewrite (almost) find a better place to - * detect this - probably in dht_layout_anomalies() - */ - if (layout->list[i].err > 0) { - gf_log (this->name, GF_LOG_DEBUG, - "path=%s err=%s on subvol=%s", - loc->path, strerror (layout->list[i].err), - (layout->list[i].xlator ? 
- layout->list[i].xlator->name : "<>")); - if (layout->list[i].err == ENOENT) - ret = 1; - } + for (i = 0; i < layout->cnt; i++) { + if ((layout->list[i].err == ENOENT) || + ((layout->list[i].err == -1) && (layout->list[i].start == 0) && + (layout->list[i].stop == 0))) { + missing++; } + } out: - return ret; + return missing; } +int +dht_layout_normalize(xlator_t *this, loc_t *loc, dht_layout_t *layout) +{ + int ret = 0; + uint32_t holes = 0; + uint32_t overlaps = 0; + uint32_t missing = 0; + uint32_t down = 0; + uint32_t misc = 0, missing_dirs = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + ret = dht_layout_sort(layout); + if (ret == -1) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SORT_FAILED, + NULL); + goto out; + } + + gf_uuid_unparse(loc->gfid, gfid); + + dht_layout_anomalies(this, loc, layout, &holes, &overlaps, &missing, &down, + &misc, NULL); + + if (holes || overlaps) { + if (missing == layout->cnt) { + gf_msg_debug(this->name, 0, + "Directory %s looked up first time" + " gfid = %s", + loc->path, gfid); + } else { + gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_ANOMALIES_INFO, + "path=%s", loc->path, "gfid=%s", gfid, "holes=%d", holes, + "overlaps=%d", overlaps, NULL); + } + ret = -1; + } + + if (ret >= 0) { + missing_dirs = dht_layout_missing_dirs(layout); + /* TODO During DHT selfheal rewrite (almost) find a better place + * to detect this - probably in dht_layout_anomalies() + */ + if (missing_dirs > 0) + ret += missing_dirs; + } + +out: + return ret; +} int -dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, - loc_t *loc, dict_t *xattr) +dht_dir_has_layout(dict_t *xattr, char *name) { - int idx = 0; - int pos = -1; - int ret = 0; - int err = 0; - int dict_ret = 0; - int32_t disk_layout[4]; - void *disk_layout_raw = NULL; - int32_t count = -1; - uint32_t start_off = -1; - uint32_t stop_off = -1; - - - for (idx = 0; idx < layout->cnt; idx++) { - if (layout->list[idx].xlator == subvol) { - pos = idx; - break; - } - 
} + void *disk_layout_raw = NULL; - if (pos == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s - no layout info for subvolume %s", - loc->path, subvol->name); - ret = 1; - goto out; - } + return dict_get_ptr(xattr, name, &disk_layout_raw); +} - err = layout->list[pos].err; +int +dht_layout_dir_mismatch(xlator_t *this, dht_layout_t *layout, xlator_t *subvol, + loc_t *loc, dict_t *xattr) +{ + int idx = 0; + int pos = -1; + int ret = 0; + int err = 0; + int dict_ret = 0; + int32_t disk_layout[4]; + void *disk_layout_raw = NULL; + uint32_t start_off = -1; + uint32_t stop_off = -1; + uint32_t commit_hash = -1; + dht_conf_t *conf = this->private; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + if (loc && loc->inode) + gf_uuid_unparse(loc->inode->gfid, gfid); + + for (idx = 0; idx < layout->cnt; idx++) { + if (layout->list[idx].xlator == subvol) { + pos = idx; + break; + } + } + + if (pos == -1) { + if (loc) { + gf_msg_debug(this->name, 0, "%s - no layout info for subvolume %s", + loc ? loc->path : "path not found", subvol->name); + } + ret = 1; + goto out; + } + + err = layout->list[pos].err; + + if (!xattr) { + if (err == 0) { + if (loc) { + gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_XATTR_DICT_NULL, + "path=%s", loc->path, NULL); + } else { + gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_XATTR_DICT_NULL, + "path not found", NULL); + } + ret = -1; + } + goto out; + } + + dict_ret = dict_get_ptr(xattr, conf->xattr_name, &disk_layout_raw); + + if (dict_ret < 0) { + if (err == 0 && layout->list[pos].stop) { + if (loc) { + gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_DISK_LAYOUT_MISSING, + "path=%s", loc->path, "gfid=%s", gfid, NULL); + } else { + gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_DISK_LAYOUT_MISSING, + "path not found" + "gfid=%s", + gfid, NULL); + } + ret = -1; + } + goto out; + } + + memcpy(disk_layout, disk_layout_raw, sizeof(disk_layout)); + + start_off = ntoh32(disk_layout[2]); + stop_off = ntoh32(disk_layout[3]); + commit_hash = ntoh32(disk_layout[0]); + + if 
((layout->list[pos].start != start_off) || + (layout->list[pos].stop != stop_off) || + (layout->list[pos].commit_hash != commit_hash)) { + gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_INFO, "subvol=%s", + layout->list[pos].xlator->name, "inode-layout:start=0x%x", + layout->list[pos].start, "inode-layout:stop=0x%x", + layout->list[pos].stop, "layout-commit-hash=0x%x; ", + layout->list[pos].commit_hash, "disk-layout:start-off=0x%x", + start_off, "disk-layout:top-off=0x%x", stop_off, + "commit-hash=0x%x", commit_hash, NULL); + ret = 1; + } else { + ret = 0; + } +out: + return ret; +} - if (!xattr) { - if (err == 0) { - gf_log (this->name, GF_LOG_INFO, - "%s - xattr dictionary is NULL", - loc->path); - ret = -1; - } - goto out; - } +int +dht_layout_preset(xlator_t *this, xlator_t *subvol, inode_t *inode) +{ + dht_layout_t *layout = NULL; + int ret = -1; + dht_conf_t *conf = NULL; - dict_ret = dict_get_ptr (xattr, "trusted.glusterfs.dht", - &disk_layout_raw); + conf = this->private; + if (!conf) + goto out; - if (dict_ret < 0) { - if (err == 0) { - gf_log (this->name, GF_LOG_INFO, - "%s - disk layout missing", loc->path); - ret = -1; - } - goto out; - } + layout = dht_layout_for_subvol(this, subvol); + if (!layout) { + gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_NO_LAYOUT_INFO, + "subvolume=%s", subvol ? subvol->name : "<nil>", NULL); + ret = -1; + goto out; + } - memcpy (disk_layout, disk_layout_raw, sizeof (disk_layout)); + gf_msg_debug(this->name, 0, "file = %s, subvol = %s", + uuid_utoa(inode->gfid), subvol ? 
subvol->name : "<nil>"); - count = ntoh32 (disk_layout[0]); - if (count != 1) { - gf_log (this->name, GF_LOG_INFO, - "%s - disk layout has invalid count %d", - loc->path, count); - ret = -1; - goto out; - } + LOCK(&conf->layout_lock); + { + dht_inode_ctx_layout_set(inode, this, layout); + } - start_off = ntoh32 (disk_layout[2]); - stop_off = ntoh32 (disk_layout[3]); - - if ((layout->list[pos].start != start_off) - || (layout->list[pos].stop != stop_off)) { - gf_log (this->name, GF_LOG_INFO, - "subvol: %s; inode layout - %"PRIu32" - %"PRIu32"; " - "disk layout - %"PRIu32" - %"PRIu32, - layout->list[pos].xlator->name, - layout->list[pos].start, layout->list[pos].stop, - start_off, stop_off); - ret = 1; - } else { - ret = 0; - } + UNLOCK(&conf->layout_lock); + + ret = 0; out: - return ret; + return ret; } - int -dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode) +dht_layout_index_for_subvol(dht_layout_t *layout, xlator_t *subvol) { - dht_layout_t *layout = NULL; - int ret = -1; - dht_conf_t *conf = NULL; - - conf = this->private; - if (!conf) - goto out; - - layout = dht_layout_for_subvol (this, subvol); - if (!layout) { - gf_log (this->name, GF_LOG_INFO, - "no pre-set layout for subvolume %s", - subvol ? subvol->name : "<nil>"); - ret = -1; - goto out; - } + int i = 0, ret = -1; - LOCK (&conf->layout_lock); - { - inode_ctx_put (inode, this, (uint64_t)(long)layout); + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) { + ret = i; + break; } - UNLOCK (&conf->layout_lock); + } - ret = 0; -out: - return ret; + return ret; } diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c index 9dd487bc87b..89ec6cca56e 100644 --- a/xlators/cluster/dht/src/dht-linkfile.c +++ b/xlators/cluster/dht/src/dht-linkfile.c @@ -1,241 +1,328 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
- GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. +#include <glusterfs/compat.h> +#include "dht-common.h" - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ +static int +dht_linkfile_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) +{ + char is_linkfile = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + prev = cookie; + conf = this->private; + + if (op_ret) + goto out; + + gf_uuid_unparse(local->loc.gfid, gfid); + + is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name); + if (!is_linkfile) + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NOT_LINK_FILE_ERROR, + "name=%s", prev->name, "path=%s", local->loc.path, "gfid=%s", + gfid, NULL); +out: + local->linkfile.linkfile_cbk(frame, cookie, this, op_ret, op_errno, inode, + stbuf, postparent, postparent, xattr); + return 0; +} -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif +static int +dht_linkfile_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int 
op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + dict_t *xattrs = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + local = frame->local; -#include "glusterfs.h" -#include "xlator.h" -#include "compat.h" -#include "dht-common.h" + if (!op_ret) + local->linked = _gf_true; + FRAME_SU_UNDO(frame, dht_local_t); + if (op_ret && (op_errno == EEXIST)) { + conf = this->private; + subvol = cookie; + if (!subvol) + goto out; + xattrs = dict_new(); + if (!xattrs) + goto out; + ret = dict_set_uint32(xattrs, conf->link_xattr_name, 256); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "mame=%s", conf->link_xattr_name, NULL); + goto out; + } + + STACK_WIND_COOKIE(frame, dht_linkfile_lookup_cbk, subvol, subvol, + subvol->fops->lookup, &local->linkfile.loc, xattrs); + if (xattrs) + dict_unref(xattrs); + return 0; + } +out: + local->linkfile.linkfile_cbk(frame, cookie, this, op_ret, op_errno, inode, + stbuf, preparent, postparent, xdata); + if (xattrs) + dict_unref(xattrs); + return 0; +} int -dht_linkfile_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) +dht_linkfile_create(call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, + xlator_t *this, xlator_t *tovol, xlator_t *fromvol, + loc_t *loc) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; + dict_t *dict = NULL; + int need_unref = 0; + int ret = 0; + dht_conf_t *conf = this->private; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + local->linkfile.linkfile_cbk = linkfile_cbk; + local->linkfile.srcvol = tovol; + loc_copy(&local->linkfile.loc, loc); + + local->linked = _gf_false; + + dict = local->params; + if (!dict) { + dict = dict_new(); + if (!dict) + goto out; + need_unref = 1; + } + + if (!gf_uuid_is_null(local->gfid)) { + gf_uuid_unparse(local->gfid, gfid); + + ret = dict_set_gfuuid(dict, 
"gfid-req", local->gfid, true); + if (ret) + gf_smsg("dht-linkfile", GF_LOG_INFO, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", loc->path, "gfid=%s", gfid, NULL); + } else { + gf_uuid_unparse(loc->gfid, gfid); + } + + ret = dict_set_str(dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + if (ret) + gf_smsg("dht-linkfile", GF_LOG_INFO, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", loc->path, "key=%s", GLUSTERFS_INTERNAL_FOP_KEY, + "gfid=%s", gfid, NULL); + + ret = dict_set_str(dict, conf->link_xattr_name, tovol->name); + + if (ret < 0) { + gf_smsg(frame->this->name, GF_LOG_INFO, 0, DHT_MSG_CREATE_LINK_FAILED, + "path=%s", loc->path, "gfid=%s", gfid, NULL); + goto out; + } + + local->link_subvol = fromvol; + /* Always create as root:root. dht_linkfile_attr_heal fixes the + * ownsership */ + FRAME_SU_DO(frame, dht_local_t); + STACK_WIND_COOKIE(frame, dht_linkfile_create_cbk, fromvol, fromvol, + fromvol->fops->mknod, loc, S_IFREG | DHT_LINKFILE_MODE, 0, + 0, dict); + + if (need_unref && dict) + dict_unref(dict); + + return 0; +out: + local->linkfile.linkfile_cbk(frame, frame->this, frame->this, -1, ENOMEM, + loc->inode, NULL, NULL, NULL, NULL); - local = frame->local; - local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, - local->linkfile.inode, - &local->linkfile.stbuf, NULL, NULL); + if (need_unref && dict) + dict_unref(dict); - return 0; + return 0; } - int -dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, - struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) +dht_linkfile_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - dict_t *xattr = NULL; - data_t *str_data = NULL; - int ret = -1; - - local = frame->local; - prev = cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed 
to create link file (%s)", - local->linkfile.loc.path, strerror (op_errno)); - goto err; - } + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; - xattr = get_new_dict (); - if (!xattr) { - op_errno = ENOMEM; - goto err; - } + local = frame->local; + subvol = cookie; - local->linkfile.xattr = dict_ref (xattr); - local->linkfile.inode = inode_ref (inode); + if (op_ret == -1) { + gf_uuid_unparse(local->loc.gfid, gfid); + gf_smsg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_UNLINK_FAILED, + "path=%s", local->loc.path, "gfid=%s", gfid, "subvolume=%s", + subvol->name, NULL); + } - str_data = str_to_data (local->linkfile.srcvol->name); - if (!str_data) { - op_errno = ENOMEM; - goto err; - } + DHT_STACK_DESTROY(frame); - ret = dict_set (xattr, "trusted.glusterfs.dht.linkto", str_data); - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to initialize linkfile data", - local->linkfile.loc.path); - } - str_data = NULL; + return 0; +} - local->linkfile.stbuf = *stbuf; +int +dht_linkfile_unlink(call_frame_t *frame, xlator_t *this, xlator_t *subvol, + loc_t *loc) +{ + call_frame_t *unlink_frame = NULL; + dht_local_t *unlink_local = NULL; - STACK_WIND (frame, dht_linkfile_xattr_cbk, - prev->this, prev->this->fops->setxattr, - &local->linkfile.loc, local->linkfile.xattr, 0); + unlink_frame = copy_frame(frame); + if (!unlink_frame) { + goto err; + } - return 0; + /* Using non-fop value here, as anyways, 'local->fop' is not used in + this particular case */ + unlink_local = dht_local_init(unlink_frame, loc, NULL, GF_FOP_MAXVALUE); + if (!unlink_local) { + goto err; + } + + STACK_WIND_COOKIE(unlink_frame, dht_linkfile_unlink_cbk, subvol, subvol, + subvol->fops->unlink, &unlink_local->loc, 0, NULL); + return 0; err: - if (str_data) { - data_destroy (str_data); - str_data = NULL; - } + if (unlink_frame) + DHT_STACK_DESTROY(unlink_frame); - local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, - inode, stbuf, 
preparent, postparent); - return 0; + return -1; } - -int -dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, - xlator_t *tovol, xlator_t *fromvol, loc_t *loc) +xlator_t * +dht_linkfile_subvol(xlator_t *this, inode_t *inode, struct iatt *stbuf, + dict_t *xattr) { - dht_local_t *local = NULL; - dict_t *dict = NULL; - int ret = 0; - - local = frame->local; - local->linkfile.linkfile_cbk = linkfile_cbk; - local->linkfile.srcvol = tovol; - loc_copy (&local->linkfile.loc, loc); - - if (!uuid_is_null (local->gfid)) { - dict = dict_new (); - if (!dict) - goto out; - ret = dict_set_static_bin (dict, "gfid-req", local->gfid, 16); - if (ret) - gf_log ("dht-linkfile", GF_LOG_INFO, - "%s: gfid set failed", loc->path); - } else if (local->params) { - dict = dict_ref (local->params); - } - if (!dict) - gf_log (frame->this->name, GF_LOG_INFO, - "dict is NULL, need to make sure gfid's are same"); + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + void *volname = NULL; + int i = 0, ret = 0; - STACK_WIND (frame, dht_linkfile_create_cbk, - fromvol, fromvol->fops->mknod, loc, - S_IFREG | DHT_LINKFILE_MODE, 0, dict); + conf = this->private; - if (dict) - dict_unref (dict); + if (!xattr) + goto out; - return 0; -out: - local->linkfile.linkfile_cbk (frame, NULL, frame->this, -1, ENOMEM, - loc->inode, NULL, NULL, NULL); - return 0; -} + ret = dict_get_ptr(xattr, conf->link_xattr_name, &volname); + if ((-1 == ret) || !volname) + goto out; -int -dht_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent) -{ - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - xlator_t *subvol = NULL; - - local = frame->local; - prev = cookie; - subvol = prev->this; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "unlinking linkfile %s on %s failed (%s)", - local->loc.path, subvol->name, strerror (op_errno)); + for (i = 0; i < conf->subvolume_cnt; i++) { + if 
(strcmp(conf->subvolumes[i]->name, (char *)volname) == 0) { + subvol = conf->subvolumes[i]; + break; } + } - DHT_STACK_DESTROY (frame); - - return 0; +out: + return subvol; } - -int -dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, - xlator_t *subvol, loc_t *loc) +static int +dht_linkfile_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) { - call_frame_t *unlink_frame = NULL; - dht_local_t *unlink_local = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; - unlink_frame = copy_frame (frame); - if (!unlink_frame) { - goto err; - } + local = frame->local; + loc = &local->loc; - unlink_local = dht_local_init (unlink_frame); - if (!unlink_local) { - goto err; - } + if (op_ret) + gf_smsg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_SETATTR_FAILED, + "path=%s", (loc->path ? loc->path : "NULL"), "gfid=%s", + uuid_utoa(local->gfid), NULL); - loc_copy (&unlink_local->loc, loc); + DHT_STACK_DESTROY(frame); - STACK_WIND (unlink_frame, dht_linkfile_unlink_cbk, - subvol, subvol->fops->unlink, - &unlink_local->loc); + return 0; +} +int +dht_linkfile_attr_heal(call_frame_t *frame, xlator_t *this) +{ + int ret = -1; + call_frame_t *copy = NULL; + dht_local_t *local = NULL; + dht_local_t *copy_local = NULL; + xlator_t *subvol = NULL; + struct iatt stbuf = { + 0, + }; + dict_t *xattr = NULL; + + local = frame->local; + + GF_VALIDATE_OR_GOTO("dht", local, out); + GF_VALIDATE_OR_GOTO("dht", local->link_subvol, out); + + if (local->stbuf.ia_type == IA_INVAL) return 0; -err: - if (unlink_frame) - DHT_STACK_DESTROY (unlink_frame); - return -1; -} + DHT_MARK_FOP_INTERNAL(xattr); + gf_uuid_copy(local->loc.gfid, local->stbuf.ia_gfid); -xlator_t * -dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf, - dict_t *xattr) -{ - dht_conf_t *conf = NULL; - xlator_t *subvol = NULL; - void *volname = NULL; - int i = 0, ret = 0; + copy = copy_frame(frame); - conf = 
this->private; + if (!copy) + goto out; - if (!xattr) - goto out; + copy_local = dht_local_init(copy, &local->loc, NULL, 0); - ret = dict_get_ptr (xattr, "trusted.glusterfs.dht.linkto", &volname); + if (!copy_local) + goto out; - if ((-1 == ret) || !volname) - goto out; + stbuf = local->stbuf; + subvol = local->link_subvol; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (strcmp (conf->subvolumes[i]->name, (char *)volname) == 0) { - subvol = conf->subvolumes[i]; - break; - } - } + copy->local = copy_local; + FRAME_SU_DO(copy, dht_local_t); + + STACK_WIND(copy, dht_linkfile_setattr_cbk, subvol, subvol->fops->setattr, + &copy_local->loc, &stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + xattr); + ret = 0; out: - return subvol; + if ((ret < 0) && (copy)) + DHT_STACK_DESTROY(copy); + + if (xattr) + dict_unref(xattr); + + return ret; } diff --git a/xlators/cluster/dht/src/dht-lock.c b/xlators/cluster/dht/src/dht-lock.c new file mode 100644 index 00000000000..638821ccee5 --- /dev/null +++ b/xlators/cluster/dht/src/dht-lock.c @@ -0,0 +1,1392 @@ +/* + Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include "dht-lock.h" + +static char * +dht_lock_asprintf(dht_lock_t *lock) +{ + char *lk_buf = NULL; + char gfid[GF_UUID_BUF_SIZE] = { + 0, + }; + + if (lock == NULL) + goto out; + + uuid_utoa_r(lock->loc.gfid, gfid); + + gf_asprintf(&lk_buf, "%s:%s", lock->xl->name, gfid); + +out: + return lk_buf; +} + +static void +dht_log_lk_array(char *name, gf_loglevel_t log_level, dht_lock_t **lk_array, + int count) +{ + int i = 0; + char *lk_buf = NULL; + + if ((lk_array == NULL) || (count == 0)) + goto out; + + for (i = 0; i < count; i++) { + lk_buf = dht_lock_asprintf(lk_array[i]); + if (!lk_buf) + goto out; + + gf_smsg(name, log_level, 0, DHT_MSG_LK_ARRAY_INFO, "index=%d", i, + "lk_buf=%s", lk_buf, NULL); + GF_FREE(lk_buf); + } + +out: + return; +} + +static void +dht_lock_stack_destroy(call_frame_t *lock_frame, dht_lock_type_t lk) +{ + dht_local_t *local = NULL; + + local = lock_frame->local; + + if (lk == DHT_INODELK) { + local->lock[0].layout.my_layout.locks = NULL; + local->lock[0].layout.my_layout.lk_count = 0; + } else { + local->lock[0].ns.directory_ns.locks = NULL; + local->lock[0].ns.directory_ns.lk_count = 0; + } + + DHT_STACK_DESTROY(lock_frame); + return; +} + +static void +dht_lock_free(dht_lock_t *lock) +{ + if (lock == NULL) + goto out; + + loc_wipe(&lock->loc); + GF_FREE(lock->domain); + GF_FREE(lock->basename); + mem_put(lock); + +out: + return; +} + +static void +dht_set_lkowner(dht_lock_t **lk_array, int count, gf_lkowner_t *lkowner) +{ + int i = 0; + + if (!lk_array || !lkowner) + goto out; + + for (i = 0; i < count; i++) { + lk_array[i]->lk_owner = *lkowner; + } + +out: + return; +} + +static int +dht_lock_request_cmp(const void *val1, const void *val2) +{ + dht_lock_t *lock1 = NULL; + dht_lock_t *lock2 = NULL; + int ret = -1; + + lock1 = *(dht_lock_t **)val1; + lock2 = *(dht_lock_t **)val2; + + GF_VALIDATE_OR_GOTO("dht-locks", lock1, out); + GF_VALIDATE_OR_GOTO("dht-locks", lock2, out); + + ret = strcmp(lock1->xl->name, lock2->xl->name); + + 
if (ret == 0) { + ret = gf_uuid_compare(lock1->loc.gfid, lock2->loc.gfid); + } + +out: + return ret; +} + +static int +dht_lock_order_requests(dht_lock_t **locks, int count) +{ + int ret = -1; + + if (!locks || !count) + goto out; + + qsort(locks, count, sizeof(*locks), dht_lock_request_cmp); + ret = 0; + +out: + return ret; +} + +void +dht_lock_array_free(dht_lock_t **lk_array, int count) +{ + int i = 0; + dht_lock_t *lock = NULL; + + if (lk_array == NULL) + goto out; + + for (i = 0; i < count; i++) { + lock = lk_array[i]; + lk_array[i] = NULL; + dht_lock_free(lock); + } + +out: + return; +} + +int32_t +dht_lock_count(dht_lock_t **lk_array, int lk_count) +{ + int i = 0, locked = 0; + + if ((lk_array == NULL) || (lk_count == 0)) + goto out; + + for (i = 0; i < lk_count; i++) { + if (lk_array[i]->locked) + locked++; + } +out: + return locked; +} + +static call_frame_t * +dht_lock_frame(call_frame_t *parent_frame) +{ + call_frame_t *lock_frame = NULL; + + lock_frame = copy_frame(parent_frame); + if (lock_frame == NULL) + goto out; + + set_lk_owner_from_ptr(&lock_frame->root->lk_owner, parent_frame->root); + +out: + return lock_frame; +} + +dht_lock_t * +dht_lock_new(xlator_t *this, xlator_t *xl, loc_t *loc, short type, + const char *domain, const char *basename, + dht_reaction_type_t do_on_failure) +{ + dht_conf_t *conf = NULL; + dht_lock_t *lock = NULL; + + conf = this->private; + + lock = mem_get0(conf->lock_pool); + if (lock == NULL) + goto out; + + lock->xl = xl; + lock->type = type; + lock->do_on_failure = do_on_failure; + + lock->domain = gf_strdup(domain); + if (lock->domain == NULL) { + dht_lock_free(lock); + lock = NULL; + goto out; + } + + if (basename) { + lock->basename = gf_strdup(basename); + if (lock->basename == NULL) { + dht_lock_free(lock); + lock = NULL; + goto out; + } + } + + /* Fill only inode and gfid. 
+ posix and protocol/server give preference to pargfid/basename over + gfid/inode for resolution if all the three parameters of loc_t are + present. I want to avoid the following hypothetical situation: + + 1. rebalance did a lookup on a dentry and got a gfid. + 2. rebalance acquires lock on loc_t which was filled with gfid and + path (pargfid/bname) from step 1. + 3. somebody deleted and recreated the same file + 4. rename on the same path acquires lock on loc_t which now points + to a different inode (and hence gets the lock). + 5. rebalance continues to migrate file (note that not all fops done + by rebalance during migration are inode/gfid based Eg., unlink) + 6. rename continues. + */ + lock->loc.inode = inode_ref(loc->inode); + loc_gfid(loc, lock->loc.gfid); + +out: + return lock; +} + +static int +dht_local_entrylk_init(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_entrylk_cbk_t entrylk_cbk) +{ + int ret = -1; + dht_local_t *local = NULL; + + local = frame->local; + + if (local == NULL) { + local = dht_local_init(frame, NULL, NULL, 0); + } + + if (local == NULL) { + goto out; + } + + local->lock[0].ns.directory_ns.entrylk_cbk = entrylk_cbk; + local->lock[0].ns.directory_ns.locks = lk_array; + local->lock[0].ns.directory_ns.lk_count = lk_count; + + ret = dht_lock_order_requests(local->lock[0].ns.directory_ns.locks, + local->lock[0].ns.directory_ns.lk_count); + if (ret < 0) + goto out; + + ret = 0; +out: + return ret; +} + +static void +dht_entrylk_done(call_frame_t *lock_frame) +{ + fop_entrylk_cbk_t entrylk_cbk = NULL; + call_frame_t *main_frame = NULL; + dht_local_t *local = NULL; + + local = lock_frame->local; + main_frame = local->main_frame; + + local->lock[0].ns.directory_ns.locks = NULL; + local->lock[0].ns.directory_ns.lk_count = 0; + + entrylk_cbk = local->lock[0].ns.directory_ns.entrylk_cbk; + local->lock[0].ns.directory_ns.entrylk_cbk = NULL; + + entrylk_cbk(main_frame, NULL, main_frame->this, + 
local->lock[0].ns.directory_ns.op_ret, + local->lock[0].ns.directory_ns.op_errno, NULL); + + dht_lock_stack_destroy(lock_frame, DHT_ENTRYLK); + return; +} + +static int32_t +dht_unlock_entrylk_done(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + gf_uuid_unparse(local->lock[0].ns.directory_ns.locks[0]->loc.inode->gfid, + gfid); + + if (op_ret < 0) { + gf_smsg(this->name, GF_LOG_WARNING, op_errno, + DHT_MSG_UNLOCK_GFID_FAILED, "gfid=%s", gfid, + "DHT_LAYOUT_HEAL_DOMAIN", NULL); + } + + DHT_STACK_DESTROY(frame); + return 0; +} + +static int32_t +dht_unlock_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int lk_index = 0, call_cnt = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + lk_index = (long)cookie; + + local = frame->local; + + uuid_utoa_r(local->lock[0].ns.directory_ns.locks[lk_index]->loc.gfid, gfid); + + if (op_ret < 0) { + gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLOCKING_FAILED, + "name=%s", + local->lock[0].ns.directory_ns.locks[lk_index]->xl->name, + "gfid=%s", gfid, NULL); + } else { + local->lock[0].ns.directory_ns.locks[lk_index]->locked = 0; + } + + call_cnt = dht_frame_return(frame); + if (is_last_call(call_cnt)) { + dht_entrylk_done(frame); + } + + return 0; +} + +static int32_t +dht_unlock_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_entrylk_cbk_t entrylk_cbk) +{ + dht_local_t *local = NULL; + int ret = -1, i = 0; + call_frame_t *lock_frame = NULL; + int call_cnt = 0; + + GF_VALIDATE_OR_GOTO("dht-locks", frame, done); + GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, done); + GF_VALIDATE_OR_GOTO(frame->this->name, entrylk_cbk, done); + + call_cnt = dht_lock_count(lk_array, lk_count); + if (call_cnt == 0) { + ret = 0; + goto done; + } + + lock_frame = 
dht_lock_frame(frame); + if (lock_frame == NULL) { + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS, + NULL); + + dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count); + goto done; + } + + ret = dht_local_entrylk_init(lock_frame, lk_array, lk_count, entrylk_cbk); + if (ret < 0) { + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK, + NULL); + + dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count); + + goto done; + } + + local = lock_frame->local; + local->main_frame = frame; + local->call_cnt = call_cnt; + + for (i = 0; i < local->lock[0].ns.directory_ns.lk_count; i++) { + if (!local->lock[0].ns.directory_ns.locks[i]->locked) + continue; + + lock_frame->root + ->lk_owner = local->lock[0].ns.directory_ns.locks[i]->lk_owner; + STACK_WIND_COOKIE( + lock_frame, dht_unlock_entrylk_cbk, (void *)(long)i, + local->lock[0].ns.directory_ns.locks[i]->xl, + local->lock[0].ns.directory_ns.locks[i]->xl->fops->entrylk, + local->lock[0].ns.directory_ns.locks[i]->domain, + &local->lock[0].ns.directory_ns.locks[i]->loc, + local->lock[0].ns.directory_ns.locks[i]->basename, ENTRYLK_UNLOCK, + ENTRYLK_WRLCK, NULL); + if (!--call_cnt) + break; + } + + return 0; + +done: + if (lock_frame) + dht_lock_stack_destroy(lock_frame, DHT_ENTRYLK); + + /* no locks acquired, invoke entrylk_cbk */ + if (ret == 0) + entrylk_cbk(frame, NULL, frame->this, 0, 0, NULL); + + return ret; +} + +int32_t +dht_unlock_entrylk_wrapper(call_frame_t *frame, dht_elock_wrap_t *entrylk) +{ + dht_local_t *local = NULL, *lock_local = NULL; + call_frame_t *lock_frame = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + int ret = 0; + + local = frame->local; + + if (!entrylk || !entrylk->locks) + goto out; + + gf_uuid_unparse(local->loc.parent->gfid, pgfid); + + lock_frame = copy_frame(frame); + if (lock_frame == NULL) { + gf_smsg(frame->this->name, GF_LOG_WARNING, 
ENOMEM, + DHT_MSG_COPY_FRAME_FAILED, "pgfid=%s", pgfid, "name=%s", + local->loc.name, "path=%s", local->loc.path, NULL); + goto done; + } + + lock_local = dht_local_init(lock_frame, NULL, NULL, 0); + if (lock_local == NULL) { + gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM, + DHT_MSG_CREATE_FAILED, "local", "pgfid=%s", pgfid, "name=%s", + local->loc.name, "path=%s", local->loc.path, NULL); + goto done; + } + + lock_frame->local = lock_local; + + lock_local->lock[0].ns.directory_ns.locks = entrylk->locks; + lock_local->lock[0].ns.directory_ns.lk_count = entrylk->lk_count; + entrylk->locks = NULL; + entrylk->lk_count = 0; + + ret = dht_unlock_entrylk( + lock_frame, lock_local->lock[0].ns.directory_ns.locks, + lock_local->lock[0].ns.directory_ns.lk_count, dht_unlock_entrylk_done); + if (ret) + goto done; + + lock_frame = NULL; + +done: + if (lock_frame != NULL) { + DHT_STACK_DESTROY(lock_frame); + } + +out: + return 0; +} + +static int +dht_entrylk_cleanup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_entrylk_done(frame); + return 0; +} + +static void +dht_entrylk_cleanup(call_frame_t *lock_frame) +{ + dht_lock_t **lk_array = NULL; + int lk_count = 0, lk_acquired = 0; + dht_local_t *local = NULL; + + local = lock_frame->local; + + lk_array = local->lock[0].ns.directory_ns.locks; + lk_count = local->lock[0].ns.directory_ns.lk_count; + + lk_acquired = dht_lock_count(lk_array, lk_count); + if (lk_acquired != 0) { + dht_unlock_entrylk(lock_frame, lk_array, lk_count, + dht_entrylk_cleanup_cbk); + } else { + dht_entrylk_done(lock_frame); + } + + return; +} + +static int32_t +dht_blocking_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + int lk_index = 0; + int i = 0; + dht_local_t *local = NULL; + + lk_index = (long)cookie; + + local = frame->local; + if (op_ret == 0) { + local->lock[0].ns.directory_ns.locks[lk_index]->locked = 
_gf_true; + } else { + switch (op_errno) { + case ESTALE: + case ENOENT: + if (local->lock[0] + .ns.directory_ns.locks[lk_index] + ->do_on_failure != IGNORE_ENOENT_ESTALE) { + local->lock[0].ns.directory_ns.op_ret = -1; + local->lock[0].ns.directory_ns.op_errno = op_errno; + goto cleanup; + } + break; + default: + local->lock[0].ns.directory_ns.op_ret = -1; + local->lock[0].ns.directory_ns.op_errno = op_errno; + goto cleanup; + } + } + + if (lk_index == (local->lock[0].ns.directory_ns.lk_count - 1)) { + for (i = 0; (i < local->lock[0].ns.directory_ns.lk_count) && + (!local->lock[0].ns.directory_ns.locks[i]->locked); + i++) + ; + + if (i == local->lock[0].ns.directory_ns.lk_count) { + local->lock[0].ns.directory_ns.op_ret = -1; + local->lock[0].ns.directory_ns.op_errno = op_errno; + } + + dht_entrylk_done(frame); + } else { + dht_blocking_entrylk_rec(frame, ++lk_index); + } + + return 0; + +cleanup: + dht_entrylk_cleanup(frame); + + return 0; +} + +void +dht_blocking_entrylk_rec(call_frame_t *frame, int i) +{ + dht_local_t *local = NULL; + + local = frame->local; + + STACK_WIND_COOKIE( + frame, dht_blocking_entrylk_cbk, (void *)(long)i, + local->lock[0].ns.directory_ns.locks[i]->xl, + local->lock[0].ns.directory_ns.locks[i]->xl->fops->entrylk, + local->lock[0].ns.directory_ns.locks[i]->domain, + &local->lock[0].ns.directory_ns.locks[i]->loc, + local->lock[0].ns.directory_ns.locks[i]->basename, ENTRYLK_LOCK, + ENTRYLK_WRLCK, NULL); + + return; +} + +int +dht_blocking_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_entrylk_cbk_t entrylk_cbk) +{ + int ret = -1; + call_frame_t *lock_frame = NULL; + dht_local_t *local = NULL; + + GF_VALIDATE_OR_GOTO("dht-locks", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, out); + GF_VALIDATE_OR_GOTO(frame->this->name, entrylk_cbk, out); + + lock_frame = dht_lock_frame(frame); + if (lock_frame == NULL) + goto out; + + ret = dht_local_entrylk_init(lock_frame, lk_array, lk_count, entrylk_cbk); + 
if (ret < 0) { + goto out; + } + + dht_set_lkowner(lk_array, lk_count, &lock_frame->root->lk_owner); + + local = lock_frame->local; + local->main_frame = frame; + + dht_blocking_entrylk_rec(lock_frame, 0); + + return 0; +out: + if (lock_frame) + dht_lock_stack_destroy(lock_frame, DHT_ENTRYLK); + + return -1; +} + +static int +dht_local_inodelk_init(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_inodelk_cbk_t inodelk_cbk) +{ + int ret = -1; + dht_local_t *local = NULL; + + local = frame->local; + + if (local == NULL) { + local = dht_local_init(frame, NULL, NULL, 0); + } + + if (local == NULL) { + goto out; + } + + local->lock[0].layout.my_layout.inodelk_cbk = inodelk_cbk; + local->lock[0].layout.my_layout.locks = lk_array; + local->lock[0].layout.my_layout.lk_count = lk_count; + + ret = dht_lock_order_requests(local->lock[0].layout.my_layout.locks, + local->lock[0].layout.my_layout.lk_count); + if (ret < 0) + goto out; + + ret = 0; +out: + return ret; +} + +static void +dht_inodelk_done(call_frame_t *lock_frame) +{ + fop_inodelk_cbk_t inodelk_cbk = NULL; + call_frame_t *main_frame = NULL; + dht_local_t *local = NULL; + + local = lock_frame->local; + main_frame = local->main_frame; + + local->lock[0].layout.my_layout.locks = NULL; + local->lock[0].layout.my_layout.lk_count = 0; + + inodelk_cbk = local->lock[0].layout.my_layout.inodelk_cbk; + local->lock[0].layout.my_layout.inodelk_cbk = NULL; + + inodelk_cbk(main_frame, NULL, main_frame->this, + local->lock[0].layout.my_layout.op_ret, + local->lock[0].layout.my_layout.op_errno, NULL); + + dht_lock_stack_destroy(lock_frame, DHT_INODELK); + return; +} + +static int32_t +dht_unlock_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int lk_index = 0, call_cnt = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + lk_index = (long)cookie; + + local = frame->local; + if (op_ret < 0) { + 
uuid_utoa_r(local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid, + gfid); + + gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLOCKING_FAILED, + "name=%s", + local->lock[0].layout.my_layout.locks[lk_index]->xl->name, + "gfid=%s", gfid, NULL); + } else { + local->lock[0].layout.my_layout.locks[lk_index]->locked = 0; + } + + call_cnt = dht_frame_return(frame); + if (is_last_call(call_cnt)) { + dht_inodelk_done(frame); + } + + return 0; +} + +static int32_t +dht_unlock_inodelk_done(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + gf_uuid_unparse(local->lock[0].layout.my_layout.locks[0]->loc.inode->gfid, + gfid); + + if (op_ret < 0) { + gf_smsg(this->name, GF_LOG_WARNING, op_errno, + DHT_MSG_UNLOCK_GFID_FAILED, "DHT_LAYOUT_HEAL_DOMAIN gfid=%s", + gfid, NULL); + } + + DHT_STACK_DESTROY(frame); + return 0; +} + +int32_t +dht_unlock_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_inodelk_cbk_t inodelk_cbk) +{ + dht_local_t *local = NULL; + struct gf_flock flock = { + 0, + }; + int ret = -1, i = 0; + call_frame_t *lock_frame = NULL; + int call_cnt = 0; + + GF_VALIDATE_OR_GOTO("dht-locks", frame, done); + GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, done); + GF_VALIDATE_OR_GOTO(frame->this->name, inodelk_cbk, done); + + call_cnt = dht_lock_count(lk_array, lk_count); + if (call_cnt == 0) { + ret = 0; + goto done; + } + + lock_frame = dht_lock_frame(frame); + if (lock_frame == NULL) { + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS, + NULL); + + dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count); + goto done; + } + + ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk); + if (ret < 0) { + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + 
DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK, + NULL); + + dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count); + + goto done; + } + + local = lock_frame->local; + local->main_frame = frame; + local->call_cnt = call_cnt; + + flock.l_type = F_UNLCK; + + for (i = 0; i < local->lock[0].layout.my_layout.lk_count; i++) { + if (!local->lock[0].layout.my_layout.locks[i]->locked) + continue; + + lock_frame->root + ->lk_owner = local->lock[0].layout.my_layout.locks[i]->lk_owner; + STACK_WIND_COOKIE( + lock_frame, dht_unlock_inodelk_cbk, (void *)(long)i, + local->lock[0].layout.my_layout.locks[i]->xl, + local->lock[0].layout.my_layout.locks[i]->xl->fops->inodelk, + local->lock[0].layout.my_layout.locks[i]->domain, + &local->lock[0].layout.my_layout.locks[i]->loc, F_SETLK, &flock, + NULL); + if (!--call_cnt) + break; + } + + return 0; + +done: + if (lock_frame) + dht_lock_stack_destroy(lock_frame, DHT_INODELK); + + /* no locks acquired, invoke inodelk_cbk */ + if (ret == 0) + inodelk_cbk(frame, NULL, frame->this, 0, 0, NULL); + + return ret; +} + +int32_t +dht_unlock_inodelk_wrapper(call_frame_t *frame, dht_ilock_wrap_t *inodelk) +{ + dht_local_t *local = NULL, *lock_local = NULL; + call_frame_t *lock_frame = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + int ret = 0; + + local = frame->local; + + if (!inodelk || !inodelk->locks) + goto out; + + gf_uuid_unparse(local->loc.parent->gfid, pgfid); + + lock_frame = copy_frame(frame); + if (lock_frame == NULL) { + gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM, + DHT_MSG_COPY_FRAME_FAILED, "pgfid=%s", pgfid, "name=%s", + local->loc.name, "path=%s", local->loc.path, NULL); + goto done; + } + + lock_local = dht_local_init(lock_frame, NULL, NULL, 0); + if (lock_local == NULL) { + gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM, + DHT_MSG_CREATE_FAILED, "local", "gfid=%s", pgfid, "name=%s", + local->loc.name, "path=%s", local->loc.path, NULL); + goto done; + } + + lock_frame->local = lock_local; 
+ + lock_local->lock[0].layout.my_layout.locks = inodelk->locks; + lock_local->lock[0].layout.my_layout.lk_count = inodelk->lk_count; + inodelk->locks = NULL; + inodelk->lk_count = 0; + + ret = dht_unlock_inodelk( + lock_frame, lock_local->lock[0].layout.my_layout.locks, + lock_local->lock[0].layout.my_layout.lk_count, dht_unlock_inodelk_done); + + if (ret) + goto done; + + lock_frame = NULL; + +done: + if (lock_frame != NULL) { + DHT_STACK_DESTROY(lock_frame); + } +out: + return 0; +} + +static int +dht_inodelk_cleanup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_inodelk_done(frame); + return 0; +} + +static void +dht_inodelk_cleanup(call_frame_t *lock_frame) +{ + dht_lock_t **lk_array = NULL; + int lk_count = 0, lk_acquired = 0; + dht_local_t *local = NULL; + + local = lock_frame->local; + + lk_array = local->lock[0].layout.my_layout.locks; + lk_count = local->lock[0].layout.my_layout.lk_count; + + lk_acquired = dht_lock_count(lk_array, lk_count); + if (lk_acquired != 0) { + dht_unlock_inodelk(lock_frame, lk_array, lk_count, + dht_inodelk_cleanup_cbk); + } else { + dht_inodelk_done(lock_frame); + } + + return; +} + +static int32_t +dht_nonblocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int lk_index = 0, call_cnt = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + lk_index = (long)cookie; + + if (op_ret == -1) { + local->lock[0].layout.my_layout.op_ret = -1; + local->lock[0].layout.my_layout.op_errno = op_errno; + + if (local && local->lock[0].layout.my_layout.locks[lk_index]) { + uuid_utoa_r(local->lock[0] + .layout.my_layout.locks[lk_index] + ->loc.inode->gfid, + gfid); + + gf_msg_debug( + this->name, op_errno, + "inodelk failed on gfid: %s " + "subvolume: %s", + gfid, + local->lock[0].layout.my_layout.locks[lk_index]->xl->name); + } + + goto out; + } + + 
local->lock[0].layout.my_layout.locks[lk_index]->locked = _gf_true; + +out: + call_cnt = dht_frame_return(frame); + if (is_last_call(call_cnt)) { + if (local->lock[0].layout.my_layout.op_ret < 0) { + dht_inodelk_cleanup(frame); + return 0; + } + + dht_inodelk_done(frame); + } + + return 0; +} + +int +dht_nonblocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array, + int lk_count, fop_inodelk_cbk_t inodelk_cbk) +{ + struct gf_flock flock = { + 0, + }; + int i = 0, ret = 0; + dht_local_t *local = NULL; + call_frame_t *lock_frame = NULL; + + GF_VALIDATE_OR_GOTO("dht-locks", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, out); + GF_VALIDATE_OR_GOTO(frame->this->name, inodelk_cbk, out); + + lock_frame = dht_lock_frame(frame); + if (lock_frame == NULL) + goto out; + + ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk); + if (ret < 0) { + goto out; + } + + dht_set_lkowner(lk_array, lk_count, &lock_frame->root->lk_owner); + + local = lock_frame->local; + local->main_frame = frame; + + local->call_cnt = lk_count; + + for (i = 0; i < lk_count; i++) { + flock.l_type = local->lock[0].layout.my_layout.locks[i]->type; + + STACK_WIND_COOKIE( + lock_frame, dht_nonblocking_inodelk_cbk, (void *)(long)i, + local->lock[0].layout.my_layout.locks[i]->xl, + local->lock[0].layout.my_layout.locks[i]->xl->fops->inodelk, + local->lock[0].layout.my_layout.locks[i]->domain, + &local->lock[0].layout.my_layout.locks[i]->loc, F_SETLK, &flock, + NULL); + } + + return 0; + +out: + if (lock_frame) + dht_lock_stack_destroy(lock_frame, DHT_INODELK); + + return -1; +} + +static int32_t +dht_blocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + int lk_index = 0; + int i = 0; + dht_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = { + 0, + }; + dht_reaction_type_t reaction = 0; + + lk_index = (long)cookie; + + local = frame->local; + if (op_ret == 0) { + 
local->lock[0].layout.my_layout.locks[lk_index]->locked = _gf_true; + } else { + switch (op_errno) { + case ESTALE: + case ENOENT: + reaction = local->lock[0] + .layout.my_layout.locks[lk_index] + ->do_on_failure; + if ((reaction != IGNORE_ENOENT_ESTALE) && + (reaction != IGNORE_ENOENT_ESTALE_EIO)) { + gf_uuid_unparse(local->lock[0] + .layout.my_layout.locks[lk_index] + ->loc.gfid, + gfid); + local->lock[0].layout.my_layout.op_ret = -1; + local->lock[0].layout.my_layout.op_errno = op_errno; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_INODELK_FAILED, "subvol=%s", + local->lock[0] + .layout.my_layout.locks[lk_index] + ->xl->name, + "gfid=%s", gfid, NULL); + goto cleanup; + } + break; + case EIO: + reaction = local->lock[0] + .layout.my_layout.locks[lk_index] + ->do_on_failure; + if (reaction != IGNORE_ENOENT_ESTALE_EIO) { + gf_uuid_unparse(local->lock[0] + .layout.my_layout.locks[lk_index] + ->loc.gfid, + gfid); + local->lock[0].layout.my_layout.op_ret = -1; + local->lock[0].layout.my_layout.op_errno = op_errno; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_INODELK_FAILED, "subvol=%s", + local->lock[0] + .layout.my_layout.locks[lk_index] + ->xl->name, + "gfid=%s", gfid, NULL); + goto cleanup; + } + break; + + default: + gf_uuid_unparse( + local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid, + gfid); + local->lock[0].layout.my_layout.op_ret = -1; + local->lock[0].layout.my_layout.op_errno = op_errno; + gf_smsg( + this->name, GF_LOG_ERROR, op_errno, DHT_MSG_INODELK_FAILED, + "subvol=%s", + local->lock[0].layout.my_layout.locks[lk_index]->xl->name, + "gfid=%s", gfid, NULL); + goto cleanup; + } + } + + if (lk_index == (local->lock[0].layout.my_layout.lk_count - 1)) { + for (i = 0; (i < local->lock[0].layout.my_layout.lk_count) && + (!local->lock[0].layout.my_layout.locks[i]->locked); + i++) + ; + + if (i == local->lock[0].layout.my_layout.lk_count) { + local->lock[0].layout.my_layout.op_ret = -1; + local->lock[0].layout.my_layout.op_errno = 
op_errno; + } + + dht_inodelk_done(frame); + } else { + dht_blocking_inodelk_rec(frame, ++lk_index); + } + + return 0; + +cleanup: + dht_inodelk_cleanup(frame); + + return 0; +} + +void +dht_blocking_inodelk_rec(call_frame_t *frame, int i) +{ + dht_local_t *local = NULL; + struct gf_flock flock = { + 0, + }; + + local = frame->local; + + flock.l_type = local->lock[0].layout.my_layout.locks[i]->type; + + STACK_WIND_COOKIE( + frame, dht_blocking_inodelk_cbk, (void *)(long)i, + local->lock[0].layout.my_layout.locks[i]->xl, + local->lock[0].layout.my_layout.locks[i]->xl->fops->inodelk, + local->lock[0].layout.my_layout.locks[i]->domain, + &local->lock[0].layout.my_layout.locks[i]->loc, F_SETLKW, &flock, NULL); + + return; +} + +int +dht_blocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_inodelk_cbk_t inodelk_cbk) +{ + int ret = -1; + call_frame_t *lock_frame = NULL; + dht_local_t *local = NULL; + dht_local_t *tmp_local = NULL; + char gfid[GF_UUID_BUF_SIZE] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("dht-locks", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, out); + GF_VALIDATE_OR_GOTO(frame->this->name, inodelk_cbk, out); + + tmp_local = frame->local; + + lock_frame = dht_lock_frame(frame); + if (lock_frame == NULL) { + gf_uuid_unparse(tmp_local->loc.gfid, gfid); + gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_LOCK_FRAME_FAILED, + "gfid=%s", gfid, "path=%s", tmp_local->loc.path, NULL); + goto out; + } + + ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk); + if (ret < 0) { + gf_uuid_unparse(tmp_local->loc.gfid, gfid); + gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_LOCAL_LOCK_INIT_FAILED, + "gfid=%s", gfid, "path=%s", tmp_local->loc.path, NULL); + goto out; + } + + dht_set_lkowner(lk_array, lk_count, &lock_frame->root->lk_owner); + + local = lock_frame->local; + local->main_frame = frame; + + dht_blocking_inodelk_rec(lock_frame, 0); + + return 0; +out: + if (lock_frame) + dht_lock_stack_destroy(lock_frame, 
DHT_INODELK); + + return -1; +} + +void +dht_unlock_namespace(call_frame_t *frame, dht_dir_transaction_t *lock) +{ + GF_VALIDATE_OR_GOTO("dht-locks", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, lock, out); + + dht_unlock_entrylk_wrapper(frame, &lock->ns.directory_ns); + dht_unlock_inodelk_wrapper(frame, &lock->ns.parent_layout); + +out: + return; +} + +static int32_t +dht_protect_namespace_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + + local = frame->local; + if (op_ret != 0) + dht_unlock_inodelk_wrapper(frame, &local->current->ns.parent_layout); + + local->current->ns.ns_cbk(frame, cookie, this, op_ret, op_errno, xdata); + return 0; +} + +int32_t +dht_blocking_entrylk_after_inodelk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int ret = -1; + loc_t *loc = NULL; + dht_lock_t **lk_array = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + int count = 0; + dht_elock_wrap_t *entrylk = NULL; + + local = frame->local; + entrylk = &local->current->ns.directory_ns; + + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; + goto err; + } + + loc = &entrylk->locks[0]->loc; + gf_uuid_unparse(loc->gfid, pgfid); + + local->op_ret = 0; + lk_array = entrylk->locks; + count = entrylk->lk_count; + + ret = dht_blocking_entrylk(frame, lk_array, count, + dht_protect_namespace_cbk); + + if (ret < 0) { + local->op_ret = -1; + local->op_errno = EIO; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_ENTRYLK_FAILED_AFT_INODELK, "fop=%s", + gf_fop_list[local->fop], "pgfid=%s", pgfid, "basename=%s", + entrylk->locks[0]->basename, NULL); + goto err; + } + + return 0; + +err: + if (lk_array != NULL) { + dht_lock_array_free(lk_array, count); + GF_FREE(lk_array); + entrylk->locks = NULL; + entrylk->lk_count = 0; + } + + /* Unlock inodelk. 
No harm calling unlock twice */ + dht_unlock_inodelk_wrapper(frame, &local->current->ns.parent_layout); + /* Call ns_cbk. It will take care of unwinding */ + local->current->ns.ns_cbk(frame, NULL, this, local->op_ret, local->op_errno, + NULL); + return 0; +} + +/* Given the loc and the subvol, this routine takes the inodelk on + * the parent inode and entrylk on (parent, loc->name). This routine + * is specific as it supports only one subvol on which it takes inodelk + * and then entrylk serially. + */ +int +dht_protect_namespace(call_frame_t *frame, loc_t *loc, xlator_t *subvol, + struct dht_namespace *ns, fop_entrylk_cbk_t ns_cbk) +{ + dht_ilock_wrap_t *inodelk = NULL; + dht_elock_wrap_t *entrylk = NULL; + dht_lock_t **lk_array = NULL; + dht_local_t *local = NULL; + xlator_t *this = NULL; + loc_t parent = { + 0, + }; + int ret = -1; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + int32_t op_errno = 0; + int count = 1; + + GF_VALIDATE_OR_GOTO("dht-locks", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, loc, out); + GF_VALIDATE_OR_GOTO(frame->this->name, loc->parent, out); + GF_VALIDATE_OR_GOTO(frame->this->name, subvol, out); + + local = frame->local; + this = frame->this; + + inodelk = &ns->parent_layout; + entrylk = &ns->directory_ns; + + /* Initialize entrylk_cbk and parent loc */ + ns->ns_cbk = ns_cbk; + + ret = dht_build_parent_loc(this, &parent, loc, &op_errno); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_LOC_FAILED, + "gfid=%s", loc->gfid, "name=%s", loc->name, "path=%s", + loc->path, NULL); + goto out; + } + gf_uuid_unparse(parent.gfid, pgfid); + + /* Alloc inodelk */ + inodelk->locks = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer); + if (inodelk->locks == NULL) { + local->op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_CALLOC_FAILED, "fop=%s", gf_fop_list[local->fop], + "pgfid=%s", pgfid, "name=%s", loc->name, "path=%s", loc->path, + NULL); + goto out; + } + + inodelk->locks[0] = 
dht_lock_new(this, subvol, &parent, F_RDLCK, + DHT_LAYOUT_HEAL_DOMAIN, NULL, + FAIL_ON_ANY_ERROR); + if (inodelk->locks[0] == NULL) { + local->op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_LOCK_ALLOC_FAILED, "inodelk-fop=%s", + gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s", + loc->name, "path=%s", loc->path, NULL); + goto err; + } + inodelk->lk_count = count; + + /* Allock entrylk */ + entrylk->locks = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer); + if (entrylk->locks == NULL) { + local->op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_CALLOC_FAILED, "entrylk-fop=%s", + gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s", + loc->name, "path=%s", loc->path, NULL); + + goto err; + } + + entrylk->locks[0] = dht_lock_new(this, subvol, &parent, F_WRLCK, + DHT_ENTRY_SYNC_DOMAIN, loc->name, + FAIL_ON_ANY_ERROR); + if (entrylk->locks[0] == NULL) { + local->op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_LOCK_ALLOC_FAILED, "entrylk-fop=%s", + gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s", + loc->name, "path=%s", loc->path, NULL); + + goto err; + } + entrylk->lk_count = count; + + /* Take read inodelk on parent. If it is successful, take write entrylk + * on name in cbk. 
+ */ + lk_array = inodelk->locks; + ret = dht_blocking_inodelk(frame, lk_array, count, + dht_blocking_entrylk_after_inodelk); + if (ret < 0) { + local->op_errno = EIO; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_BLOCK_INODELK_FAILED, "fop=%s", gf_fop_list[local->fop], + "pgfid=%s", pgfid, "name=%s", loc->name, "path=%s", loc->path, + NULL); + + goto err; + } + + loc_wipe(&parent); + + return 0; +err: + if (entrylk->locks != NULL) { + dht_lock_array_free(entrylk->locks, count); + GF_FREE(entrylk->locks); + entrylk->locks = NULL; + entrylk->lk_count = 0; + } + + if (inodelk->locks != NULL) { + dht_lock_array_free(inodelk->locks, count); + GF_FREE(inodelk->locks); + inodelk->locks = NULL; + inodelk->lk_count = 0; + } + + loc_wipe(&parent); +out: + return -1; +} diff --git a/xlators/cluster/dht/src/dht-lock.h b/xlators/cluster/dht/src/dht-lock.h new file mode 100644 index 00000000000..6485c03fb6e --- /dev/null +++ b/xlators/cluster/dht/src/dht-lock.h @@ -0,0 +1,91 @@ +/* + Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef _DHT_LOCK_H +#define _DHT_LOCK_H + +#include "dht-common.h" + +void +dht_lock_array_free(dht_lock_t **lk_array, int count); + +int32_t +dht_lock_count(dht_lock_t **lk_array, int lk_count); + +dht_lock_t * +dht_lock_new(xlator_t *this, xlator_t *xl, loc_t *loc, short type, + const char *domain, const char *basename, + dht_reaction_type_t do_on_failure); + +int32_t +dht_unlock_entrylk_wrapper(call_frame_t *, dht_elock_wrap_t *); + +void +dht_blocking_entrylk_rec(call_frame_t *frame, int i); + +int +dht_blocking_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_inodelk_cbk_t entrylk_cbk); + +int32_t +dht_unlock_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_inodelk_cbk_t inodelk_cbk); + +int32_t +dht_unlock_inodelk_wrapper(call_frame_t *, dht_ilock_wrap_t *); + +/* Acquire non-blocking inodelk on a list of xlators. + * + * @lk_array: array of lock requests lock on. + * + * @lk_count: number of locks in @lk_array + * + * @inodelk_cbk: will be called after inodelk replies are received + * + * @retval: -1 if stack_winding inodelk fails. 0 otherwise. + * inodelk_cbk is called with appropriate error on errors. + * On failure to acquire lock on all members of list, successful + * locks are unlocked before invoking cbk. + */ + +int +dht_nonblocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array, + int lk_count, fop_inodelk_cbk_t inodelk_cbk); + +void +dht_blocking_inodelk_rec(call_frame_t *frame, int i); + +/* same as dht_nonblocking_inodelk, but issues sequential blocking locks on + * @lk_array directly. locks are issued on some order which remains same + * for a list of xlators (irrespective of order of xlators within list). 
+ */ + +int +dht_blocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_inodelk_cbk_t inodelk_cbk); + +int32_t +dht_blocking_entrylk_after_inodelk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata); + +int32_t +dht_blocking_entrylk_after_inodelk_rename(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata); + +void +dht_unlock_namespace(call_frame_t *, dht_dir_transaction_t *); + +int +dht_protect_namespace(call_frame_t *frame, loc_t *loc, xlator_t *subvol, + struct dht_namespace *ns, fop_entrylk_cbk_t ns_cbk); + +#endif /* _DHT_LOCK_H */ diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h index af31c8b0724..e3c4471334a 100644 --- a/xlators/cluster/dht/src/dht-mem-types.h +++ b/xlators/cluster/dht/src/dht-mem-types.h @@ -1,40 +1,38 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ - #ifndef __DHT_MEM_TYPES_H__ #define __DHT_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_dht_mem_types_ { - gf_dht_mt_dht_du_t = gf_common_mt_end + 1, - gf_dht_mt_dht_conf_t, - gf_dht_mt_char, - gf_dht_mt_int32_t, - gf_dht_mt_dht_local_t, - gf_dht_mt_xlator_t, - gf_dht_mt_dht_layout_t, - gf_switch_mt_dht_conf_t, - gf_switch_mt_dht_du_t, - gf_switch_mt_switch_sched_array, - gf_switch_mt_switch_struct, - gf_dht_mt_end + gf_dht_mt_dht_du_t = gf_common_mt_end + 1, + gf_dht_mt_dht_conf_t, + gf_dht_mt_char, + gf_dht_mt_int32_t, + gf_dht_mt_xlator_t, + gf_dht_mt_dht_layout_t, + gf_switch_mt_switch_sched_array, + gf_switch_mt_switch_struct, + gf_dht_mt_subvol_time, + gf_dht_mt_loc_t, + gf_defrag_info_mt, + gf_dht_mt_inode_ctx_t, + gf_dht_mt_dirent_t, + gf_dht_mt_container_t, + gf_dht_mt_octx_t, + gf_dht_mt_miginfo_t, + gf_dht_mt_fd_ctx_t, + gf_dht_ret_cache_t, + gf_dht_nodeuuids_t, + gf_dht_mt_end }; #endif diff --git a/xlators/cluster/dht/src/dht-messages.h b/xlators/cluster/dht/src/dht-messages.h new file mode 100644 index 00000000000..601f8dad78b --- /dev/null +++ b/xlators/cluster/dht/src/dht-messages.h @@ -0,0 +1,386 @@ +/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _DHT_MESSAGES_H_ +#define _DHT_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. 
+ */ + +GLFS_MSGID( + DHT, DHT_MSG_CACHED_SUBVOL_GET_FAILED, DHT_MSG_CREATE_LINK_FAILED, + DHT_MSG_DICT_SET_FAILED, DHT_MSG_DIR_ATTR_HEAL_FAILED, + DHT_MSG_DIR_SELFHEAL_FAILED, DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, + DHT_MSG_FILE_ON_MULT_SUBVOL, DHT_MSG_FILE_TYPE_MISMATCH, + DHT_MSG_GFID_MISMATCH, DHT_MSG_GFID_NULL, DHT_MSG_HASHED_SUBVOL_GET_FAILED, + DHT_MSG_INIT_FAILED, DHT_MSG_INVALID_CONFIGURATION, + DHT_MSG_INVALID_DISK_LAYOUT, DHT_MSG_INVALID_OPTION, + DHT_MSG_LAYOUT_FIX_FAILED, DHT_MSG_LAYOUT_MERGE_FAILED, + DHT_MSG_LAYOUT_MISMATCH, DHT_MSG_LAYOUT_NULL, DHT_MSG_MIGRATE_DATA_COMPLETE, + DHT_MSG_MIGRATE_DATA_FAILED, DHT_MSG_MIGRATE_FILE_COMPLETE, + DHT_MSG_MIGRATE_FILE_FAILED, DHT_MSG_NO_MEMORY, DHT_MSG_OPENDIR_FAILED, + DHT_MSG_REBALANCE_FAILED, DHT_MSG_REBALANCE_START_FAILED, + DHT_MSG_REBALANCE_STATUS, DHT_MSG_REBALANCE_STOPPED, DHT_MSG_RENAME_FAILED, + DHT_MSG_SETATTR_FAILED, DHT_MSG_SUBVOL_INSUFF_INODES, + DHT_MSG_SUBVOL_INSUFF_SPACE, DHT_MSG_UNLINK_FAILED, + DHT_MSG_LAYOUT_SET_FAILED, DHT_MSG_LOG_FIXED_LAYOUT, + DHT_MSG_GET_XATTR_FAILED, DHT_MSG_FILE_LOOKUP_FAILED, + DHT_MSG_OPEN_FD_FAILED, DHT_MSG_SET_INODE_CTX_FAILED, + DHT_MSG_UNLOCKING_FAILED, DHT_MSG_DISK_LAYOUT_NULL, DHT_MSG_SUBVOL_INFO, + DHT_MSG_CHUNK_SIZE_INFO, DHT_MSG_LAYOUT_FORM_FAILED, DHT_MSG_SUBVOL_ERROR, + DHT_MSG_LAYOUT_SORT_FAILED, DHT_MSG_REGEX_INFO, DHT_MSG_FOPEN_FAILED, + DHT_MSG_SET_HOSTNAME_FAILED, DHT_MSG_BRICK_ERROR, DHT_MSG_SYNCOP_FAILED, + DHT_MSG_MIGRATE_INFO, DHT_MSG_SOCKET_ERROR, DHT_MSG_CREATE_FD_FAILED, + DHT_MSG_READDIR_ERROR, DHT_MSG_CHILD_LOC_BUILD_FAILED, + DHT_MSG_SET_SWITCH_PATTERN_ERROR, DHT_MSG_COMPUTE_HASH_FAILED, + DHT_MSG_FIND_LAYOUT_ANOMALIES_ERROR, DHT_MSG_ANOMALIES_INFO, + DHT_MSG_LAYOUT_INFO, DHT_MSG_INODE_LK_ERROR, DHT_MSG_RENAME_INFO, + DHT_MSG_DATA_NULL, DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED, + DHT_MSG_UNLINK_LOOKUP_INFO, DHT_MSG_LINK_FILE_LOOKUP_INFO, + DHT_MSG_OPERATION_NOT_SUP, DHT_MSG_NOT_LINK_FILE_ERROR, DHT_MSG_CHILD_DOWN, + DHT_MSG_UUID_PARSE_ERROR, 
DHT_MSG_GET_DISK_INFO_ERROR, + DHT_MSG_INVALID_VALUE, DHT_MSG_SWITCH_PATTERN_INFO, + DHT_MSG_SUBVOL_OP_FAILED, DHT_MSG_LAYOUT_PRESET_FAILED, + DHT_MSG_INVALID_LINKFILE, DHT_MSG_FIX_LAYOUT_INFO, + DHT_MSG_GET_HOSTNAME_FAILED, DHT_MSG_WRITE_FAILED, + DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED, DHT_MSG_FSYNC_FAILED, + DHT_MSG_SUBVOL_DECOMMISSION_INFO, DHT_MSG_BRICK_QUERY_FAILED, + DHT_MSG_SUBVOL_NO_LAYOUT_INFO, DHT_MSG_OPEN_FD_ON_DST_FAILED, + DHT_MSG_SUBVOL_NOT_FOUND, DHT_MSG_FILE_LOOKUP_ON_DST_FAILED, + DHT_MSG_DISK_LAYOUT_MISSING, DHT_MSG_DICT_GET_FAILED, + DHT_MSG_REVALIDATE_CBK_INFO, DHT_MSG_UPGRADE_BRICKS, DHT_MSG_LK_ARRAY_INFO, + DHT_MSG_RENAME_NOT_LOCAL, DHT_MSG_RECONFIGURE_INFO, + DHT_MSG_INIT_LOCAL_SUBVOL_FAILED, DHT_MSG_SYS_CALL_GET_TIME_FAILED, + DHT_MSG_NO_DISK_USAGE_STATUS, DHT_MSG_SUBVOL_DOWN_ERROR, + DHT_MSG_REBAL_THROTTLE_INFO, DHT_MSG_COMMIT_HASH_INFO, + DHT_MSG_REBAL_STRUCT_SET, DHT_MSG_HAS_MIGINFO, DHT_MSG_SETTLE_HASH_FAILED, + DHT_MSG_DEFRAG_PROCESS_DIR_FAILED, DHT_MSG_FD_CTX_SET_FAILED, + DHT_MSG_STALE_LOOKUP, DHT_MSG_PARENT_LAYOUT_CHANGED, + DHT_MSG_LOCK_MIGRATION_FAILED, DHT_MSG_LOCK_INODE_UNREF_FAILED, + DHT_MSG_ASPRINTF_FAILED, DHT_MSG_DIR_LOOKUP_FAILED, DHT_MSG_INODELK_FAILED, + DHT_MSG_LOCK_FRAME_FAILED, DHT_MSG_LOCAL_LOCK_INIT_FAILED, + DHT_MSG_ENTRYLK_ERROR, DHT_MSG_INODELK_ERROR, DHT_MSG_LOC_FAILED, + DHT_MSG_UNKNOWN_FOP, DHT_MSG_MIGRATE_FILE_SKIPPED, + DHT_MSG_DIR_XATTR_HEAL_FAILED, DHT_MSG_HASHED_SUBVOL_DOWN, + DHT_MSG_NON_HASHED_SUBVOL_DOWN, DHT_MSG_SYNCTASK_CREATE_FAILED, + DHT_MSG_DIR_HEAL_ABORT, DHT_MSG_MIGRATE_SKIP, DHT_MSG_FD_CREATE_FAILED, + DHT_MSG_DICT_NEW_FAILED, DHT_MSG_FAILED_TO_OPEN, DHT_MSG_CREATE_FAILED, + DHT_MSG_FILE_NOT_EXIST, DHT_MSG_CHOWN_FAILED, DHT_MSG_FALLOCATE_FAILED, + DHT_MSG_FTRUNCATE_FAILED, DHT_MSG_STATFS_FAILED, DHT_MSG_WRITE_CROSS, + DHT_MSG_NEW_TARGET_FOUND, DHT_MSG_INSUFF_MEMORY, DHT_MSG_SET_XATTR_FAILED, + DHT_MSG_SET_MODE_FAILED, DHT_MSG_FILE_EXISTS_IN_DEST, + DHT_MSG_SYMLINK_FAILED, 
DHT_MSG_LINKFILE_DEL_FAILED, DHT_MSG_MKNOD_FAILED, + DHT_MSG_MIGRATE_CLEANUP_FAILED, DHT_MSG_LOCK_MIGRATE, + DHT_MSG_PARENT_BUILD_FAILED, DHT_MSG_HASHED_SUBVOL_NOT_FOUND, + DHT_MSG_ACQUIRE_ENTRYLK_FAILED, DHT_MSG_CREATE_DST_FAILED, + DHT_MSG_MIGRATION_EXIT, DHT_MSG_CHANGED_DST, DHT_MSG_TRACE_FAILED, + DHT_MSG_WRITE_LOCK_FAILED, DHT_MSG_GETACTIVELK_FAILED, DHT_MSG_STAT_FAILED, + DHT_MSG_UNLINK_PERFORM_FAILED, DHT_MSG_CLANUP_SOURCE_FILE_FAILED, + DHT_MSG_UNLOCK_FILE_FAILED, DHT_MSG_REMOVE_XATTR_FAILED, + DHT_MSG_DATA_MIGRATE_ABORT, DHT_MSG_DEFRAG_NULL, DHT_MSG_PARENT_NULL, + DHT_MSG_GFID_NOT_PRESENT, DHT_MSG_CHILD_LOC_FAILED, + DHT_MSG_SET_LOOKUP_FAILED, DHT_MSG_DIR_REMOVED, DHT_MSG_FIX_NOT_COMP, + DHT_MSG_SUBVOL_DETER_FAILED, DHT_MSG_LOCAL_SUBVOL, DHT_MSG_NODE_UUID, + DHT_MSG_SIZE_FILE, DHT_MSG_GET_DATA_SIZE_FAILED, + DHT_MSG_PTHREAD_JOIN_FAILED, DHT_MSG_COUNTER_THREAD_CREATE_FAILED, + DHT_MSG_MIGRATION_INIT_QUEUE_FAILED, DHT_MSG_PAUSED_TIMEOUT, DHT_MSG_WOKE, + DHT_MSG_ABORT_REBALANCE, DHT_MSG_CREATE_TASK_REBAL_FAILED, + DHT_MSG_REBAL_ESTIMATE_NOT_AVAIL, DHT_MSG_ADD_CHOICES_ERROR, + DHT_MSG_GET_CHOICES_ERROR, DHT_MSG_PREPARE_STATUS_ERROR, + DHT_MSG_SET_CHOICE_FAILED, DHT_MSG_SET_HASHED_SUBVOL_FAILED, + DHT_MSG_XATTR_HEAL_NOT_POSS, DHT_MSG_LINKTO_FILE_FAILED, + DHT_MSG_STALE_LINKFILE_DELETE, DHT_MSG_NO_SUBVOL_FOR_LINKTO, + DHT_MSG_SUBVOL_RETURNED, DHT_MSG_UNKNOWN_LOCAL_XSEL, DHT_MSG_GET_XATTR_ERR, + DHT_MSG_ALLOC_OR_FILL_FAILED, DHT_MSG_GET_REAL_NAME_FAILED, + DHT_MSG_COPY_UUID_FAILED, DHT_MSG_MDS_DETER_FAILED, + DHT_MSG_CREATE_REBAL_FAILED, DHT_MSG_LINK_LAYOUT_FAILED, + DHT_MSG_NO_SUBVOL_IN_LAYOUT, DHT_MSG_MEM_ALLOC_FAILED, + DHT_MSG_SET_IN_PARAMS_DICT_FAILED, DHT_MSG_LOC_COPY_FAILED, + DHT_MSG_PARENT_LOC_FAILED, DHT_MSG_CREATE_LOCK_FAILED, + DHT_MSG_PREV_ATTEMPT_FAILED, DHT_MSG_REFRESH_ATTEMPT, + DHT_MSG_ACQUIRE_LOCK_FAILED, DHT_MSG_CREATE_STUB_FAILED, + DHT_MSG_WIND_LOCK_REQ_FAILED, DHT_MSG_REFRESH_FAILED, + DHT_MSG_CACHED_SUBVOL_ERROR, DHT_MSG_NO_LINK_SUBVOL, 
DHT_MSG_SET_KEY_FAILED, + DHT_MSG_REMOVE_LINKTO_FAILED, DHT_MSG_LAYOUT_DICT_SET_FAILED, + DHT_MSG_XATTR_DICT_NULL, DHT_MSG_DUMMY_ALLOC_FAILED, DHT_MSG_DICT_IS_NULL, + DHT_MSG_LINK_INODE_FAILED, DHT_MSG_SELFHEAL_FAILED, DHT_MSG_NO_MDS_SUBVOL, + DHT_MSG_LIST_XATTRS_FAILED, DHT_MSG_RESET_INTER_XATTR_FAILED, + DHT_MSG_MDS_DOWN_UNABLE_TO_SET, DHT_MSG_WIND_UNLOCK_FAILED, + DHT_MSG_COMMIT_HASH_FAILED, DHT_MSG_UNLOCK_GFID_FAILED, + DHT_MSG_UNLOCK_FOLLOW_ENTRYLK, DHT_MSG_COPY_FRAME_FAILED, + DHT_MSG_UNLOCK_FOLLOW_LOCKS, DHT_MSG_ENTRYLK_FAILED_AFT_INODELK, + DHT_MSG_CALLOC_FAILED, DHT_MSG_LOCK_ALLOC_FAILED, + DHT_MSG_BLOCK_INODELK_FAILED, + DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK, + DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS, + DHT_MSG_DST_NULL_SET_FAILED); + +#define DHT_MSG_FD_CTX_SET_FAILED_STR "Failed to set fd ctx" +#define DHT_MSG_INVALID_VALUE_STR "Different dst found in the fd ctx" +#define DHT_MSG_UNKNOWN_FOP_STR "Unknown FOP on file" +#define DHT_MSG_OPEN_FD_ON_DST_FAILED_STR "Failed to open the fd on file" +#define DHT_MSG_SYNCTASK_CREATE_FAILED_STR "Failed to create synctask" +#define DHT_MSG_ASPRINTF_FAILED_STR \ + "asprintf failed while fetching subvol from the id" +#define DHT_MSG_HAS_MIGINFO_STR "Found miginfo in the inode ctx" +#define DHT_MSG_FILE_LOOKUP_FAILED_STR "failed to lookup the file" +#define DHT_MSG_INVALID_LINKFILE_STR \ + "linkto target is different from cached-subvol. treating as destination " \ + "subvol" +#define DHT_MSG_GFID_MISMATCH_STR "gfid different on the target file" +#define DHT_MSG_GET_XATTR_FAILED_STR "failed to get 'linkto' xattr" +#define DHT_MSG_SET_INODE_CTX_FAILED_STR "failed to set inode-ctx target file" +#define DHT_MSG_DIR_SELFHEAL_FAILED_STR "Healing of path failed" +#define DHT_MSG_DIR_HEAL_ABORT_STR \ + "Failed to get path from subvol. 
Aborting directory healing"
+#define DHT_MSG_DIR_XATTR_HEAL_FAILED_STR "xattr heal failed for directory"
+#define DHT_MSG_LOCK_INODE_UNREF_FAILED_STR \
+    "Found a NULL inode. Failed to unref the inode"
+#define DHT_MSG_DICT_SET_FAILED_STR "Failed to set dictionary value"
+#define DHT_MSG_NOT_LINK_FILE_ERROR_STR "got non-linkfile"
+#define DHT_MSG_CREATE_LINK_FAILED_STR "failed to initialize linkfile data"
+#define DHT_MSG_UNLINK_FAILED_STR "Unlinking linkfile on subvolume failed"
+#define DHT_MSG_MIGRATE_FILE_FAILED_STR "Migrate file failed"
+#define DHT_MSG_NO_MEMORY_STR "could not allocate memory for dict"
+#define DHT_MSG_SUBVOL_ERROR_STR "Failed to get linkto subvol"
+#define DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED_STR "link failed on subvol"
+#define DHT_MSG_MIGRATE_FILE_SKIPPED_STR "Migration skipped"
+#define DHT_MSG_FD_CREATE_FAILED_STR "fd create failed"
+#define DHT_MSG_DICT_NEW_FAILED_STR "dict_new failed"
+#define DHT_MSG_FAILED_TO_OPEN_STR "failed to open"
+#define DHT_MSG_CREATE_FAILED_STR "failed to create"
+#define DHT_MSG_FILE_NOT_EXIST_STR "file does not exist"
+#define DHT_MSG_CHOWN_FAILED_STR "chown failed"
+#define DHT_MSG_FALLOCATE_FAILED_STR "fallocate failed"
+#define DHT_MSG_FTRUNCATE_FAILED_STR "ftruncate failed"
+#define DHT_MSG_STATFS_FAILED_STR "failed to get statfs"
+#define DHT_MSG_WRITE_CROSS_STR \
+    "write will cross min-free-disk for file on subvol. looking for new subvol"
+#define DHT_MSG_SUBVOL_INSUFF_SPACE_STR \
+    "Could not find any subvol with space accommodating the file. Consider " \
+    "adding bricks"
+#define DHT_MSG_NEW_TARGET_FOUND_STR "New target found for file"
+#define DHT_MSG_INSUFF_MEMORY_STR "insufficient memory"
+#define DHT_MSG_SET_XATTR_FAILED_STR "failed to set xattr"
+#define DHT_MSG_SET_MODE_FAILED_STR "failed to set mode"
+#define DHT_MSG_FILE_EXISTS_IN_DEST_STR "file exists in destination"
+#define DHT_MSG_LINKFILE_DEL_FAILED_STR "failed to delete the linkfile"
+#define DHT_MSG_SYMLINK_FAILED_STR "symlink failed"
+#define DHT_MSG_MKNOD_FAILED_STR "mknod failed"
+#define DHT_MSG_SETATTR_FAILED_STR "failed to perform setattr"
+#define DHT_MSG_MIGRATE_CLEANUP_FAILED_STR \
+    "Migrate file cleanup failed: failed to fstat file"
+#define DHT_MSG_LOCK_MIGRATE_STR "locks will be migrated for file"
+#define DHT_MSG_PARENT_BUILD_FAILED_STR \
+    "failed to build parent loc, which is needed to acquire entrylk to " \
+    "synchronize with renames on this path. Skipping migration"
+#define DHT_MSG_HASHED_SUBVOL_NOT_FOUND_STR \
+    "cannot find hashed subvol which is needed to synchronize with renames " \
+    "on this path. Skipping migration"
+#define DHT_MSG_ACQUIRE_ENTRYLK_FAILED_STR "failed to acquire entrylk on subvol"
+#define DHT_MSG_CREATE_DST_FAILED_STR "create dst failed for file"
+#define DHT_MSG_MIGRATION_EXIT_STR "Exiting migration"
+#define DHT_MSG_CHANGED_DST_STR "destination changed for file"
+#define DHT_MSG_TRACE_FAILED_STR "Trace failed"
+#define DHT_MSG_WRITE_LOCK_FAILED_STR "write lock failed"
+#define DHT_MSG_GETACTIVELK_FAILED_STR "getactivelk failed for file"
+#define DHT_MSG_STAT_FAILED_STR "failed to do a stat"
+#define DHT_MSG_UNLINK_PERFORM_FAILED_STR "failed to perform unlink"
+#define DHT_MSG_MIGRATE_FILE_COMPLETE_STR "completed migration"
+#define DHT_MSG_CLANUP_SOURCE_FILE_FAILED_STR "failed to cleanup source file"
+#define DHT_MSG_UNLOCK_FILE_FAILED_STR "failed to unlock file"
+#define DHT_MSG_REMOVE_XATTR_FAILED_STR "remove xattr failed"
+#define DHT_MSG_SOCKET_ERROR_STR "Failed to unlink listener socket"
+#define DHT_MSG_HASHED_SUBVOL_GET_FAILED_STR "Failed to get hashed subvolume"
+#define DHT_MSG_CACHED_SUBVOL_GET_FAILED_STR "Failed to get cached subvolume"
+#define DHT_MSG_MIGRATE_DATA_FAILED_STR "migrate-data failed"
+#define DHT_MSG_DEFRAG_NULL_STR "defrag is NULL"
+#define DHT_MSG_DATA_MIGRATE_ABORT_STR \
+    "Readdirp failed. Aborting data migration for dict"
+#define DHT_MSG_LAYOUT_FIX_FAILED_STR "fix layout failed"
+#define DHT_MSG_PARENT_NULL_STR "parent is NULL"
+#define DHT_MSG_GFID_NOT_PRESENT_STR "gfid not present"
+#define DHT_MSG_CHILD_LOC_FAILED_STR "Child loc build failed"
+#define DHT_MSG_SET_LOOKUP_FAILED_STR "Failed to set lookup"
+#define DHT_MSG_DIR_LOOKUP_FAILED_STR "lookup failed"
+#define DHT_MSG_DIR_REMOVED_STR "Dir renamed or removed. Skipping"
+#define DHT_MSG_READDIR_ERROR_STR "readdir failed, Aborting fix-layout"
+#define DHT_MSG_SETTLE_HASH_FAILED_STR "Settle hash failed"
+#define DHT_MSG_DEFRAG_PROCESS_DIR_FAILED_STR "gf_defrag_process_dir failed"
+#define DHT_MSG_FIX_NOT_COMP_STR \
+    "Unable to retrieve fixlayout xattr.
Assume background fix layout not " \ + "complete" +#define DHT_MSG_SUBVOL_DETER_FAILED_STR \ + "local subvolume determination failed with error" +#define DHT_MSG_LOCAL_SUBVOL_STR "local subvol" +#define DHT_MSG_NODE_UUID_STR "node uuid" +#define DHT_MSG_SIZE_FILE_STR "Total size files" +#define DHT_MSG_GET_DATA_SIZE_FAILED_STR \ + "Failed to get the total data size. Unable to estimate time to complete " \ + "rebalance" +#define DHT_MSG_PTHREAD_JOIN_FAILED_STR \ + "file_counter_thread: pthread_join failed" +#define DHT_MSG_COUNTER_THREAD_CREATE_FAILED_STR \ + "Failed to create the file counter thread" +#define DHT_MSG_MIGRATION_INIT_QUEUE_FAILED_STR \ + "Failed to initialise migration queue" +#define DHT_MSG_REBALANCE_STOPPED_STR "Received stop command on rebalance" +#define DHT_MSG_PAUSED_TIMEOUT_STR "Request pause timer timeout" +#define DHT_MSG_WOKE_STR "woken" +#define DHT_MSG_ABORT_REBALANCE_STR "Aborting rebalance" +#define DHT_MSG_REBALANCE_START_FAILED_STR \ + "Failed to start rebalance: look up on / failed" +#define DHT_MSG_CREATE_TASK_REBAL_FAILED_STR \ + "Could not create task for rebalance" +#define DHT_MSG_REBAL_ESTIMATE_NOT_AVAIL_STR \ + "Rebalance estimates will not be available" +#define DHT_MSG_REBALANCE_STATUS_STR "Rebalance status" +#define DHT_MSG_DATA_NULL_STR "data value is NULL" +#define DHT_MSG_ADD_CHOICES_ERROR_STR "Error to add choices in buffer" +#define DHT_MSG_GET_CHOICES_ERROR_STR "Error to get choices" +#define DHT_MSG_PREPARE_STATUS_ERROR_STR "Error to prepare status" +#define DHT_MSG_SET_CHOICE_FAILED_STR "Failed to set full choice" +#define DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED_STR \ + "Failed to aggregate quota xattr" +#define DHT_MSG_FILE_TYPE_MISMATCH_STR \ + "path exists as a file on one subvolume and directory on another. 
Please " \
+    "fix it manually"
+#define DHT_MSG_LAYOUT_SET_FAILED_STR "failed to set layout for subvolume"
+#define DHT_MSG_LAYOUT_MERGE_FAILED_STR "failed to merge layouts for subvolume"
+#define DHT_MSG_SET_HASHED_SUBVOL_FAILED_STR "Failed to set hashed subvolume"
+#define DHT_MSG_XATTR_HEAL_NOT_POSS_STR \
+    "No gfid exists for path. so healing xattr is not possible"
+#define DHT_MSG_REVALIDATE_CBK_INFO_STR "Revalidate: subvolume returned -1"
+#define DHT_MSG_LAYOUT_MISMATCH_STR "Mismatching layouts"
+#define DHT_MSG_UNLINK_LOOKUP_INFO_STR "lookup_unlink returned"
+#define DHT_MSG_LINKTO_FILE_FAILED_STR \
+    "Could not unlink the linkto file as either fd is open and/or linkto " \
+    "xattr is set"
+#define DHT_MSG_LAYOUT_PRESET_FAILED_STR \
+    "Could not set pre-set layout for subvolume"
+#define DHT_MSG_FILE_ON_MULT_SUBVOL_STR \
+    "multiple subvolumes have file (preferably rename the file in the " \
+    "backend, and do a fresh lookup)"
+#define DHT_MSG_STALE_LINKFILE_DELETE_STR \
+    "attempting deletion of stale linkfile"
+#define DHT_MSG_LINK_FILE_LOOKUP_INFO_STR "Lookup on following linkfile"
+#define DHT_MSG_NO_SUBVOL_FOR_LINKTO_STR "No link subvolume for linkto"
+#define DHT_MSG_SUBVOL_RETURNED_STR "Subvolume returned -1"
+#define DHT_MSG_UNKNOWN_LOCAL_XSEL_STR "Unknown local->xsel"
+#define DHT_MSG_DICT_GET_FAILED_STR "Failed to get"
+#define DHT_MSG_UUID_PARSE_ERROR_STR "Failed to parse uuid"
+#define DHT_MSG_GET_XATTR_ERR_STR "getxattr err for dir"
+#define DHT_MSG_ALLOC_OR_FILL_FAILED_STR "alloc or fill failed"
+#define DHT_MSG_UPGRADE_BRICKS_STR \
+    "At least one of the bricks does not support this operation.
Please " \ + "upgrade all bricks" +#define DHT_MSG_GET_REAL_NAME_FAILED_STR "Failed to get real filename" +#define DHT_MSG_LAYOUT_NULL_STR "Layout is NULL" +#define DHT_MSG_COPY_UUID_FAILED_STR "Failed to copy node uuid key" +#define DHT_MSG_MDS_DETER_FAILED_STR \ + "Cannot determine MDS, fetching xattr randomly from a subvol" +#define DHT_MSG_HASHED_SUBVOL_DOWN_STR \ + "MDS is down for path, so fetching xattr randomly from subvol" +#define DHT_MSG_CREATE_REBAL_FAILED_STR \ + "failed to create a new rebalance synctask" +#define DHT_MSG_FIX_LAYOUT_INFO_STR "fixing the layout" +#define DHT_MSG_OPERATION_NOT_SUP_STR "wrong directory-spread-count value" +#define DHT_MSG_LINK_LAYOUT_FAILED_STR "failed to link the layout in inode" +#define DHT_MSG_NO_SUBVOL_IN_LAYOUT_STR "no subvolume in layout for path" +#define DHT_MSG_INODE_LK_ERROR_STR "mknod lock failed for file" +#define DHT_MSG_MEM_ALLOC_FAILED_STR "mem allocation failed" +#define DHT_MSG_PARENT_LAYOUT_CHANGED_STR \ + "extracting in-memory layout of parent failed" +#define DHT_MSG_SET_IN_PARAMS_DICT_FAILED_STR \ + "setting in params dictionary failed" +#define DHT_MSG_LOC_COPY_FAILED_STR "loc_copy failed" +#define DHT_MSG_LOC_FAILED_STR "parent loc build failed" +#define DHT_MSG_PARENT_LOC_FAILED_STR "locking parent failed" +#define DHT_MSG_CREATE_LOCK_FAILED_STR "Create lock failed" +#define DHT_MSG_PREV_ATTEMPT_FAILED_STR \ + "mkdir loop detected. parent layout didn't change even though previous " \ + "attempt of mkdir failed because of in-memory layout not matching with " \ + "that on disk." +#define DHT_MSG_REFRESH_ATTEMPT_STR \ + "mkdir parent layout changed. 
Attempting a refresh and then a retry"
+#define DHT_MSG_ACQUIRE_LOCK_FAILED_STR \
+    "Acquiring lock on parent to guard against layout-change failed"
+#define DHT_MSG_CREATE_STUB_FAILED_STR "creating stub failed"
+#define DHT_MSG_WIND_LOCK_REQ_FAILED_STR \
+    "cannot wind lock request to guard parent layout"
+#define DHT_MSG_REFRESH_FAILED_STR "refreshing parent layout failed."
+#define DHT_MSG_CACHED_SUBVOL_ERROR_STR "On cached subvol"
+#define DHT_MSG_NO_LINK_SUBVOL_STR "Linkfile does not have link subvolume"
+#define DHT_MSG_SET_KEY_FAILED_STR "failed to set key"
+#define DHT_MSG_CHILD_DOWN_STR "Received CHILD_DOWN. Exiting"
+#define DHT_MSG_LOG_FIXED_LAYOUT_STR "log layout fixed"
+#define DHT_MSG_REBAL_STRUCT_SET_STR "local->rebalance already set"
+#define DHT_MSG_REMOVE_LINKTO_FAILED_STR "Removal of linkto failed at subvol"
+#define DHT_MSG_LAYOUT_DICT_SET_FAILED_STR "dht layout dict set failed"
+#define DHT_MSG_SUBVOL_INFO_STR "creating subvolume"
+#define DHT_MSG_COMPUTE_HASH_FAILED_STR "hash computation failed"
+#define DHT_MSG_INVALID_DISK_LAYOUT_STR \
+    "Invalid disk layout: Catastrophic error layout with unknown type found"
+#define DHT_MSG_LAYOUT_SORT_FAILED_STR "layout sort failed"
+#define DHT_MSG_ANOMALIES_INFO_STR "Found anomalies"
+#define DHT_MSG_XATTR_DICT_NULL_STR "xattr dictionary is NULL"
+#define DHT_MSG_DISK_LAYOUT_MISSING_STR "Disk layout missing"
+#define DHT_MSG_LAYOUT_INFO_STR "layout info"
+#define DHT_MSG_SUBVOL_NO_LAYOUT_INFO_STR "no pre-set layout for subvol"
+/* NOTE(review): no DHT_MSG_SELFHEAL_XATTR_FAILED identifier appears in the
+ * GLFS_MSGID list of this header; confirm whether this string is still used. */
+#define DHT_MSG_SELFHEAL_XATTR_FAILED_STR "layout setxattr failed"
+#define DHT_MSG_DIR_SELFHEAL_XATTR_FAILED_STR "Directory self heal xattr failed"
+#define DHT_MSG_DUMMY_ALLOC_FAILED_STR "failed to allocate dummy layout"
+#define DHT_MSG_DICT_IS_NULL_STR \
+    "dict is NULL, need to make sure gfids are same"
+#define DHT_MSG_ENTRYLK_ERROR_STR "acquiring entrylk after inodelk failed"
+#define DHT_MSG_NO_DISK_USAGE_STATUS_STR "no du stats"
+#define DHT_MSG_LINK_INODE_FAILED_STR "linking inode failed"
+#define DHT_MSG_SELFHEAL_FAILED_STR "Directory selfheal failed"
+#define DHT_MSG_NO_MDS_SUBVOL_STR "No mds subvol"
+#define DHT_MSG_LIST_XATTRS_FAILED_STR "failed to list xattrs"
+#define DHT_MSG_RESET_INTER_XATTR_FAILED_STR "Failed to reset internal xattr"
+#define DHT_MSG_MDS_DOWN_UNABLE_TO_SET_STR \
+    "mds subvol is down, unable to set xattr"
+#define DHT_MSG_DIR_ATTR_HEAL_FAILED_STR \
+    "Directory attr heal failed. Failed to set uid/gid"
+#define DHT_MSG_WIND_UNLOCK_FAILED_STR \
+    "Winding unlock failed: stale locks left on brick"
+#define DHT_MSG_COMMIT_HASH_FAILED_STR "Directory commit hash update failed"
+#define DHT_MSG_LK_ARRAY_INFO_STR "lk info"
+#define DHT_MSG_UNLOCK_GFID_FAILED_STR \
+    "unlock failed on gfid: stale lock might be left"
+#define DHT_MSG_UNLOCKING_FAILED_STR "unlocking failed"
+#define DHT_MSG_UNLOCK_FOLLOW_ENTRYLK_STR "not unlocking following entrylks"
+#define DHT_MSG_COPY_FRAME_FAILED_STR "copy frame failed"
+#define DHT_MSG_UNLOCK_FOLLOW_LOCKS_STR "not unlocking following locks"
+#define DHT_MSG_INODELK_FAILED_STR "inodelk failed on subvol"
+#define DHT_MSG_LOCK_FRAME_FAILED_STR "memory allocation failed for lock_frame"
+#define DHT_MSG_LOCAL_LOCK_INIT_FAILED_STR "dht_local_lock_init failed"
+#define DHT_MSG_ENTRYLK_FAILED_AFT_INODELK_STR \
+    "dht_blocking_entrylk failed after taking inodelk"
+#define DHT_MSG_BLOCK_INODELK_FAILED_STR "dht_blocking_inodelk failed"
+#define DHT_MSG_CALLOC_FAILED_STR "calloc failed"
+#define DHT_MSG_LOCK_ALLOC_FAILED_STR "lock allocation failed"
+#define DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS_STR \
+    "cannot allocate a frame, not unlocking following entrylks"
+#define DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK_STR \
+    "storing locks in local failed, not unlocking following entrylks"
+#define DHT_MSG_DST_NULL_SET_FAILED_STR \
+    "src or dst is NULL, Failed to set dictionary value"
+
+#endif /* _DHT_MESSAGES_H_ */
diff --git
a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c new file mode 100644 index 00000000000..8ba8082bd86 --- /dev/null +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -0,0 +1,4702 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "dht-common.h" +#include <glusterfs/syscall.h> +#include <fnmatch.h> +#include <signal.h> +#include <glusterfs/events.h> +#include "glusterfs/compat-errno.h" // for ENODATA on BSD + +#define GF_DISK_SECTOR_SIZE 512 +#define DHT_REBALANCE_PID 4242 /* Change it if required */ +#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */ +#define MAX_MIGRATE_QUEUE_COUNT 500 +#define MIN_MIGRATE_QUEUE_COUNT 200 +#define MAX_REBAL_TYPE_SIZE 16 +#define FILE_CNT_INTERVAL 600 /* 10 mins */ +#define ESTIMATE_START_INTERVAL 600 /* 10 mins */ +#define HARDLINK_MIG_INPROGRESS -2 +#define SKIP_MIGRATION_FD_POSITIVE -3 +#ifndef MAX +#define MAX(a, b) (((a) > (b)) ? 
(a) : (b))
+#endif
+
+/* Advance idx by one, wrapping back to 0 once it reaches sv_cnt.
+ * The arguments are expanded unparenthesized and idx is evaluated twice, so
+ * pass plain variables only. */
+#define GF_CRAWL_INDEX_MOVE(idx, sv_cnt) \
+    { \
+        idx++; \
+        idx %= sv_cnt; \
+    }
+
+uint64_t g_totalfiles = 0;
+uint64_t g_totalsize = 0;
+
+/* Release a dir_dfmeta and everything it owns: the queued dirents of each of
+ * the local_subvols_cnt entry queues, a reference on every non-NULL fd in
+ * meta->lfd, the per-subvol arrays and finally meta itself.
+ * A NULL meta is a no-op. */
+void
+gf_defrag_free_dir_dfmeta(struct dir_dfmeta *meta, int local_subvols_cnt)
+{
+    int i = 0;
+
+    if (meta) {
+        for (i = 0; i < local_subvols_cnt; i++) {
+            if (meta->equeue)
+                gf_dirent_free(&meta->equeue[i]);
+            if (meta->lfd && meta->lfd[i])
+                fd_unref(meta->lfd[i]);
+        }
+
+        GF_FREE(meta->equeue);
+        GF_FREE(meta->head);
+        GF_FREE(meta->iterator);
+        GF_FREE(meta->offset_var);
+        GF_FREE(meta->fetch_entries);
+        GF_FREE(meta->lfd);
+        GF_FREE(meta);
+    }
+}
+
+/* Release a dht_container: its dirent entry, its parent loc (wiped first,
+ * then freed) and the container itself. A NULL container is a no-op. */
+void
+gf_defrag_free_container(struct dht_container *container)
+{
+    if (container) {
+        gf_dirent_entry_free(container->df_entry);
+
+        if (container->parent_loc) {
+            loc_wipe(container->parent_loc);
+        }
+
+        GF_FREE(container->parent_loc);
+
+        GF_FREE(container);
+    }
+}
+
+/* Store ret in defrag->global_error while holding defrag->lock. */
+void
+dht_set_global_defrag_error(gf_defrag_info_t *defrag, int ret)
+{
+    LOCK(&defrag->lock);
+    {
+        defrag->global_error = ret;
+    }
+    UNLOCK(&defrag->lock);
+    return;
+}
+
+/* Emit the gf_event matching a terminal rebalance status (COMPLETE, FAILED or
+ * STOPPED); any other status emits nothing.
+ *
+ * The volume name reported is this->name with a trailing "-dht" removed. When
+ * the suffix is absent, the name is shorter than the suffix, or the working
+ * copy cannot be allocated, this->name is reported unchanged.
+ *
+ * @cmd is not used. NOTE(review): ret is never updated, so -1 is returned on
+ * every path (unchanged from the previous behaviour); confirm that callers
+ * ignore the result before giving it a meaning. */
+static int
+dht_send_rebalance_event(xlator_t *this, int cmd, gf_defrag_status_t status)
+{
+    int ret = -1;
+    char *volname = NULL;
+    char *tmpstr = NULL;
+    const char *suffix = "-dht";
+    size_t name_len = 0;
+    size_t suffix_len = 0;
+    size_t len = 0;
+
+    eventtypes_t event = EVENT_LAST;
+
+    switch (status) {
+        case GF_DEFRAG_STATUS_COMPLETE:
+            event = EVENT_VOLUME_REBALANCE_COMPLETE;
+            break;
+        case GF_DEFRAG_STATUS_FAILED:
+            event = EVENT_VOLUME_REBALANCE_FAILED;
+            break;
+        case GF_DEFRAG_STATUS_STOPPED:
+            event = EVENT_VOLUME_REBALANCE_STOP;
+            break;
+        default:
+            break;
+    }
+
+    /* DHT volume: strip the "-dht" suffix. The length check matters: for a
+     * name shorter than the suffix the old "strlen(name) - strlen(suffix)"
+     * went negative and the suffix comparison read before the start of the
+     * copied string. */
+    name_len = strlen(this->name);
+    suffix_len = strlen(suffix);
+    if (name_len >= suffix_len) {
+        len = name_len - suffix_len;
+        tmpstr = gf_strdup(this->name);
+        if (tmpstr && !strcmp(tmpstr + len, suffix)) {
+            tmpstr[len] = '\0';
+            volname = tmpstr;
+        }
+    }
+
+    if (!volname) {
+        /* Better than nothing */
+        volname = this->name;
+    }
+
+    if (event != EVENT_LAST) {
+        gf_event(event, "volume=%s", volname);
+    }
+
+    GF_FREE(tmpstr);
+    return ret;
+}
+
+/* Remove the "trusted.SGI_ACL_FILE" and POSIX_ACL_ACCESS_XATTR keys from
+ * dict. A NULL dict is a no-op. */
+static void
+dht_strip_out_acls(dict_t *dict)
+{
+    if (dict) {
+        dict_del(dict, "trusted.SGI_ACL_FILE");
+        dict_del(dict, POSIX_ACL_ACCESS_XATTR);
+    }
+}
+
+/*
+  return values:
+  -1 : failure
+  -2 : success
+
+Hard link migration is carried out in three stages.
+
+(Say there are n hardlinks)
+Stage 1: Setting the new hashed subvol information on the 1st hardlink
+         encountered (linkto setxattr)
+
+Stage 2: Creating hardlinks on new hashed subvol for the 2nd to (n-1)th
+         hardlink
+
+Stage 3: Physical migration of the data file for nth hardlink
+
+Why "-2" is deemed success and not "0":
+
+   dht_migrate_file expects return value "0" from _is_file_migratable if
+the file has to be migrated.
+
+   _is_file_migratable returns zero only when it is called with the
+flag "GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS".
+
+   gf_defrag_handle_hardlink calls dht_migrate_file for physical migration
+of the data file with the flag "GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS"
+
+Hence, gf_defrag_handle_hardlink returning "0" for success would force
+"dht_migrate_file" to migrate each of the hardlinks, which is not intended.
+
+For each of the three stages mentioned above "-2" will be returned and will
+be converted to "0" in dht_migrate_file.
+
+Arguments, as used by the code below:
+  this      - DHT xlator; this->private is read as the dht_conf_t.
+  loc       - the hardlink being processed. loc, loc->name, loc->pargfid and
+              loc->gfid must all be set, otherwise the call fails.
+  fop_errno - out parameter; set to EINVAL on entry and overwritten with the
+              errno describing a failure on the error paths below.
+
+Besides -1 and -2, a non-zero result of dht_migrate_file (Stage 3) is
+returned to the caller unchanged.
+*/
+
+int32_t
+gf_defrag_handle_hardlink(xlator_t *this, loc_t *loc, int *fop_errno)
+{
+    int32_t ret = -1;
+    xlator_t *cached_subvol = NULL;
+    xlator_t *hashed_subvol = NULL;
+    xlator_t *linkto_subvol = NULL;
+    data_t *data = NULL;
+    struct iatt iatt = {
+        0,
+    };
+    int32_t op_errno = 0;
+    dht_conf_t *conf = NULL;
+    gf_loglevel_t loglevel = 0;
+    dict_t *link_xattr = NULL;
+    dict_t *dict = NULL;
+    dict_t *xattr_rsp = NULL;
+    struct iatt stbuf = {
+        0,
+    };
+
+    *fop_errno = EINVAL;
+
+    GF_VALIDATE_OR_GOTO("defrag", loc, out);
+    GF_VALIDATE_OR_GOTO("defrag", loc->name, out);
+    GF_VALIDATE_OR_GOTO("defrag", this, out);
+    GF_VALIDATE_OR_GOTO("defrag", this->private, out);
+
+    conf = this->private;
+
+    if (gf_uuid_is_null(loc->pargfid)) {
+        gf_msg("", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed :"
+               "loc->pargfid is NULL for %s",
+               loc->path);
+        *fop_errno = EINVAL;
+        ret = -1;
+        goto out;
+    }
+
+    if (gf_uuid_is_null(loc->gfid)) {
+        gf_msg("", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed :"
+               "loc->gfid is NULL for %s",
+               loc->path);
+        *fop_errno = EINVAL;
+        ret = -1;
+        goto out;
+    }
+
+    link_xattr = dict_new(); /* holds the linkto value written in Stage 1 */
+    if (!link_xattr) {
+        ret = -1;
+        *fop_errno = ENOMEM;
+        goto out;
+    }
+
+    /*
+      Parallel migration can lead to migration of the hard link multiple
+      times which can lead to data loss. Hence, adding a fresh lookup to
+      decide whether migration is required or not.
+
+      Elaborating the scenario for, let's say, 10 hardlinks [link{1..10}]:
+      Let's say the first hard link "link1" does the setxattr of the
+      new hashed subvolume info on the cached file. As there are multiple
+      threads working, we might already have all the links created on the
+      new hashed by the time we reach, say, hardlink link5. Now the
+      number of links on hashed is equal to that of cached. Hence, file
+      migration will happen for link6.
+
+             Cached                                 Hashed
+      --------T link6                        rwxrwxrwx   link6
+
+      After the above state all the link files on the cached will be zero
+      byte linkto files. Hence, if we still do migration for the following
+      files link{7..10}, we will end up migrating 0 data leading to data
+      loss.
+      Hence, a lookup can make sure whether we need to migrate the
+      file or not.
+    */
+
+    dict = dict_new(); /* xattr request passed to the fresh lookup below */
+    if (!dict) {
+        ret = -1;
+        *fop_errno = ENOMEM;
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+               "could not allocate memory for dict");
+        goto out;
+    }
+
+    ret = dict_set_int32(dict, conf->link_xattr_name, 256);
+    if (ret) {
+        *fop_errno = ENOMEM;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed:"
+               "%s: failed to set 'linkto' key in dict",
+               loc->path);
+        goto out;
+    }
+
+    ret = syncop_lookup(this, loc, &stbuf, NULL, dict, &xattr_rsp);
+    if (ret) {
+        /* Ignore ENOENT and ESTALE as the file might have been
+           migrated already; this is reported as success (-2). */
+        if (-ret == ENOENT || -ret == ESTALE) {
+            ret = -2;
+            goto out;
+        }
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed:%s lookup failed with ret = %d", loc->path,
+               ret);
+        *fop_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    cached_subvol = dht_subvol_get_cached(this, loc->inode);
+    if (!cached_subvol) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed :"
+               "Failed to get cached subvol"
+               " for %s on %s",
+               loc->name, this->name);
+        *fop_errno = EINVAL;
+        ret = -1;
+        goto out;
+    }
+
+    hashed_subvol = dht_subvol_get_hashed(this, loc);
+    if (!hashed_subvol) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed :"
+               "Failed to get hashed subvol"
+               " for %s on %s",
+               loc->name, this->name);
+        *fop_errno = EINVAL;
+        ret = -1;
+        goto out;
+    }
+
+    /* Hardlink migration happens only with remove-brick. So this condition will
+     * be true only when the migration has happened. In case hardlinks are
+     * migrated for rebalance case, remove this check. Having this check here
+     * avoids redundant calls below. */
+    if (hashed_subvol == cached_subvol) {
+        ret = -2;
+        goto out;
+    }
+
+    gf_log(this->name, GF_LOG_INFO,
+           "Attempting to migrate hardlink %s "
+           "with gfid %s from %s -> %s",
+           loc->name, uuid_utoa(loc->gfid), cached_subvol->name,
+           hashed_subvol->name);
+
+    data = dict_get(xattr_rsp, conf->link_xattr_name);
+    /* set linkto on cached -> hashed if not present (Stage 1), else link it
+     * on the target (Stage 2) */
+    if (!data) {
+        ret = dict_set_str(link_xattr, conf->link_xattr_name,
+                           hashed_subvol->name);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Migrate file failed :"
+                   "Failed to set dictionary value:"
+                   " key = %s for %s",
+                   conf->link_xattr_name, loc->name);
+            *fop_errno = ENOMEM;
+            ret = -1;
+            goto out;
+        }
+
+        ret = syncop_setxattr(cached_subvol, loc, link_xattr, 0, NULL, NULL);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Migrate file failed :"
+                   "Linkto setxattr failed %s -> %s",
+                   cached_subvol->name, loc->name);
+            *fop_errno = -ret;
+            ret = -1;
+            goto out;
+        }
+
+        gf_msg_debug(this->name, 0,
+                     "hardlink target subvol created on %s "
+                     ",cached %s, file %s",
+                     hashed_subvol->name, cached_subvol->name, loc->path);
+
+        ret = -2; /* Stage 1 done for this link; nothing more to do here */
+        goto out;
+    } else {
+        linkto_subvol = dht_linkfile_subvol(this, NULL, NULL, xattr_rsp);
+        if (!linkto_subvol) {
+            /* logged only: the computed hashed_subvol is used as the target */
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SUBVOL_ERROR,
+                   "Failed to get "
+                   "linkto subvol for %s",
+                   loc->name);
+        } else {
+            hashed_subvol = linkto_subvol; /* linkto target takes precedence */
+        }
+
+        ret = syncop_link(hashed_subvol, loc, loc, &iatt, NULL, NULL);
+        if (ret) {
+            op_errno = -ret;
+            ret = -1;
+
+            loglevel = (op_errno == EEXIST) ? GF_LOG_DEBUG : GF_LOG_ERROR;
+            gf_msg(this->name, loglevel, op_errno,
+                   DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED,
+                   "link of %s -> %s"
+                   " failed on subvol %s",
+                   loc->name, uuid_utoa(loc->gfid), hashed_subvol->name);
+            /* EEXIST is tolerated: fall through to the lookup below */
+            if (op_errno != EEXIST) {
+                *fop_errno = op_errno;
+                goto out;
+            }
+        } else {
+            gf_msg_debug(this->name, 0,
+                         "syncop_link successful for"
+                         " hardlink %s on subvol %s, cached %s",
+                         loc->path, hashed_subvol->name, cached_subvol->name);
+        }
+    }
+
+    ret = syncop_lookup(hashed_subvol, loc, &iatt, NULL, NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed :Failed lookup %s on %s ", loc->name,
+               hashed_subvol->name);
+
+        *fop_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    /* There is a race where on the target subvol for the hardlink
+     * (note: hash subvol for the hardlink might differ from this), some
+     * other client(non-rebalance) would have created a linkto file for that
+     * hardlink as part of lookup. So let's say there are 10 hardlinks, on the
+     * 5th hardlink itself the hardlinks might have migrated. Now for
+     * (6..10th) hardlinks the cached and target would be same as the file
+     * has already migrated. Hence this check is needed */
+    if (cached_subvol == hashed_subvol) {
+        gf_msg_debug(this->name, 0,
+                     "source %s and destination %s "
+                     "for hardlink %s are same",
+                     cached_subvol->name, hashed_subvol->name, loc->path);
+        ret = -2;
+        goto out;
+    }
+
+    /* Stage 3: the link count seen on the target equals the count from the
+     * initial lookup, so migrate the data file itself. */
+    if (iatt.ia_nlink == stbuf.ia_nlink) {
+        ret = dht_migrate_file(this, loc, cached_subvol, hashed_subvol,
+                               GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS, fop_errno);
+        if (ret) {
+            goto out;
+        }
+    }
+    ret = -2;
+out:
+    if (link_xattr)
+        dict_unref(link_xattr);
+
+    if (xattr_rsp)
+        dict_unref(xattr_rsp);
+
+    if (dict)
+        dict_unref(dict);
+
+    return ret;
+}
+/*
+ * Decide how a file with hardlinks is handled during migration.
+ *
+ * Returns 0 when flags is GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS or when
+ * stbuf->ia_nlink is not greater than 1. For a file with more than one link:
+ * with flags == GF_DHT_MIGRATE_HARDLINK the result of
+ * gf_defrag_handle_hardlink (run under conf->link_lock) is returned as is;
+ * with any other flags the file is skipped, *fop_errno is set to ENOTSUP and
+ * 1 is returned.
+ *
+ * xattrs and defrag are not used in this function.
+ */
+static int
+__check_file_has_hardlink(xlator_t *this, loc_t *loc, struct iatt *stbuf,
+                          dict_t *xattrs, int flags, gf_defrag_info_t *defrag,
+                          dht_conf_t *conf, int *fop_errno)
+{
+    int ret = 0;
+
+    if (flags == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS) {
+        ret = 0;
+        return ret;
+    }
+    if (stbuf->ia_nlink > 1) {
+        /* support for decommission */
+        if (flags == GF_DHT_MIGRATE_HARDLINK) {
+            synclock_lock(&conf->link_lock);
+            ret = gf_defrag_handle_hardlink(this, loc, fop_errno);
+            synclock_unlock(&conf->link_lock);
+            /*
+              Returning zero will force the file to be remigrated.
+              Check out gf_defrag_handle_hardlink for more information.
+            */
+            if (ret && ret != -2) {
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       DHT_MSG_MIGRATE_FILE_FAILED,
+                       "Migrate file failed:"
+                       "%s: failed to migrate file with link",
+                       loc->path);
+            }
+        } else {
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Migration skipped for:"
+                   "%s: file has hardlinks",
+                   loc->path);
+            *fop_errno = ENOTSUP;
+            ret = 1;
+        }
+    }
+
+    return ret;
+}
+
+/*
+  return values
+   0 : File will be migrated
+  -2 : File will not be migrated
+       (This is the return value from gf_defrag_handle_hardlink.
Checkout + gf_defrag_handle_hardlink for description of "returning -2") + -1 : failure +*/ +static int +__is_file_migratable(xlator_t *this, loc_t *loc, struct iatt *stbuf, + dict_t *xattrs, int flags, gf_defrag_info_t *defrag, + dht_conf_t *conf, int *fop_errno) +{ + int ret = -1; + int lock_count = 0; + + if (IA_ISDIR(stbuf->ia_type)) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: migrate-file called on directory", + loc->path); + *fop_errno = EISDIR; + ret = -1; + goto out; + } + + if (!conf->lock_migration_enabled) { + ret = dict_get_int32(xattrs, GLUSTERFS_POSIXLK_COUNT, &lock_count); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: Unable to get lock count for file", + loc->path); + *fop_errno = EINVAL; + ret = -1; + goto out; + } + + if (lock_count) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: %s: File has locks." 
+ " Skipping file migration", + loc->path); + *fop_errno = ENOTSUP; + ret = 1; + goto out; + } + } + + /* Check if file has hardlink*/ + ret = __check_file_has_hardlink(this, loc, stbuf, xattrs, flags, defrag, + conf, fop_errno); +out: + return ret; +} + +static int +__dht_rebalance_create_dst_file(xlator_t *this, xlator_t *to, xlator_t *from, + loc_t *loc, struct iatt *stbuf, fd_t **dst_fd, + int *fop_errno, int file_has_holes) +{ + int ret = -1; + int ret2 = -1; + fd_t *fd = NULL; + struct iatt new_stbuf = { + 0, + }; + struct iatt check_stbuf = { + 0, + }; + dht_conf_t *conf = NULL; + dict_t *dict = NULL; + dict_t *xdata = NULL; + + conf = this->private; + + dict = dict_new(); + if (!dict) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "dictionary allocation failed for" + "path:%s", + loc->path); + goto out; + } + ret = dict_set_gfuuid(dict, "gfid-req", stbuf->ia_gfid, true); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "%s: failed to set dictionary value: key = gfid-req", loc->path); + goto out; + } + + ret = dict_set_str(dict, conf->link_xattr_name, from->name); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "%s: failed to set dictionary value: key = %s ", loc->path, + conf->link_xattr_name); + goto out; + } + + fd = fd_create(loc->inode, DHT_REBALANCE_PID); + if (!fd) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: fd create failed (destination)", loc->path); + goto out; + } + + xdata = dict_new(); + if (!xdata) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: dict_new failed)", loc->path); + goto out; + } + + ret = dict_set_int32_sizen(xdata, GF_CLEAN_WRITE_PROTECTION, 1); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, 
GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "%s: failed to set dictionary value: key = %s ", loc->path, + GF_CLEAN_WRITE_PROTECTION); + goto out; + } + + ret = syncop_lookup(to, loc, &new_stbuf, NULL, xdata, NULL); + if (!ret) { + /* File exits in the destination, check if gfid matches */ + if (gf_uuid_compare(stbuf->ia_gfid, new_stbuf.ia_gfid) != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, + "file %s exists in %s with different gfid", loc->path, + to->name); + *fop_errno = EINVAL; + ret = -1; + goto out; + } + } + if ((ret < 0) && (-ret != ENOENT)) { + /* File exists in destination, but not accessible */ + gf_msg(THIS->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to lookup file", loc->path); + *fop_errno = -ret; + ret = -1; + goto out; + } + + /* Create the destination with LINKFILE mode, and linkto xattr, + if the linkfile already exists, just open the file */ + if (!ret) { + /* + * File already present, just open the file. + */ + ret = syncop_open(to, loc, O_RDWR, fd, NULL, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "failed to open %s on %s", loc->path, to->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + } else { + ret = syncop_create(to, loc, O_RDWR, DHT_LINKFILE_MODE, fd, &new_stbuf, + dict, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "failed to create %s on %s", loc->path, to->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + } + + fd_bind(fd); + + /*Reason of doing lookup after create again: + *In the create, there is some time-gap between opening fd at the + *server (posix_layer) and binding it in server (incrementing fd count), + *so if in that time-gap, if other process sends unlink considering it + *as a linkto file, because inode->fd count will be 0, so file will be + *unlinked at the backend. 
And because further operations are performed + *on fd, so though migration will be done but will end with no file + *at the backend. + */ + + ret = syncop_lookup(to, loc, &check_stbuf, NULL, NULL, NULL); + if (!ret) { + if (gf_uuid_compare(stbuf->ia_gfid, check_stbuf.ia_gfid) != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, + "file %s exists in %s with different gfid," + "found in lookup after create", + loc->path, to->name); + *fop_errno = EINVAL; + ret = -1; + goto out; + } + } + + if (-ret == ENOENT) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: file does not exist" + "on %s", + loc->path, to->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + + ret = syncop_fsetattr(to, fd, stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + NULL, NULL, NULL, NULL); + if (ret < 0) { + *fop_errno = -ret; + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "chown failed for %s on %s", loc->path, to->name); + } + + /* No need to bother about 0 byte size files */ + if (stbuf->ia_size > 0) { + if (conf->use_fallocate && !file_has_holes) { + ret = syncop_fallocate(to, fd, 0, 0, stbuf->ia_size, NULL, NULL); + if (ret < 0) { + if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -ENOSYS) { + conf->use_fallocate = _gf_false; + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "fallocate failed for %s on %s", loc->path, + to->name); + + *fop_errno = -ret; + + /* fallocate does not release the space + * in some cases + */ + ret2 = syncop_ftruncate(to, fd, 0, NULL, NULL, NULL, NULL); + if (ret2 < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret2, + DHT_MSG_MIGRATE_FILE_FAILED, + "ftruncate failed for " + "%s on %s", + loc->path, to->name); + } + goto out; + } + } + } else { + ret = syncop_ftruncate(to, fd, stbuf->ia_size, NULL, NULL, NULL, + NULL); + if (ret < 0) { + *fop_errno = -ret; + gf_msg(this->name, GF_LOG_WARNING, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "ftruncate failed for %s on 
%s", loc->path, to->name); + } + } + } + + /* success */ + ret = 0; + + if (dst_fd) + *dst_fd = fd; + +out: + if (ret) { + if (fd) { + fd_unref(fd); + } + } + if (dict) + dict_unref(dict); + + if (xdata) + dict_unref(xdata); + + return ret; +} + +static int +__dht_check_free_space(xlator_t *this, xlator_t *to, xlator_t *from, loc_t *loc, + struct iatt *stbuf, int flag, dht_conf_t *conf, + gf_boolean_t *target_changed, xlator_t **new_subvol, + int *fop_errno) +{ + struct statvfs src_statfs = { + 0, + }; + struct statvfs dst_statfs = { + 0, + }; + int ret = -1; + dict_t *xdata = NULL; + dht_layout_t *layout = NULL; + uint64_t src_statfs_blocks = 1; + uint64_t dst_statfs_blocks = 1; + double dst_post_availspacepercent = 0; + double src_post_availspacepercent = 0; + uint64_t file_blocks = 0; + uint64_t src_total_blocks = 0; + uint64_t dst_total_blocks = 0; + + xdata = dict_new(); + if (!xdata) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "failed to allocate dictionary"); + goto out; + } + + ret = dict_set_int8(xdata, GF_INTERNAL_IGNORE_DEEM_STATFS, 1); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "Failed to set " GF_INTERNAL_IGNORE_DEEM_STATFS " in dict"); + ret = -1; + *fop_errno = ENOMEM; + goto out; + } + + ret = syncop_statfs(from, loc, &src_statfs, xdata, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "failed to get statfs of %s on %s", loc->path, from->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + + ret = syncop_statfs(to, loc, &dst_statfs, xdata, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "failed to get statfs of %s on %s", loc->path, to->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + + gf_msg_debug(this->name, 0, + "min_free_disk - %f , block available - %" PRId64 + ", block size - %lu", + conf->min_free_disk, dst_statfs.f_bavail, dst_statfs.f_bsize); + + dst_statfs_blocks = 
dst_statfs.f_bavail * + (dst_statfs.f_frsize / GF_DISK_SECTOR_SIZE); + + src_statfs_blocks = src_statfs.f_bavail * + (src_statfs.f_frsize / GF_DISK_SECTOR_SIZE); + + dst_total_blocks = dst_statfs.f_blocks * + (dst_statfs.f_frsize / GF_DISK_SECTOR_SIZE); + + src_total_blocks = src_statfs.f_blocks * + (src_statfs.f_frsize / GF_DISK_SECTOR_SIZE); + + /* if force option is given, do not check for space @ dst. + * Check only if space is avail for the file */ + if (flag != GF_DHT_MIGRATE_DATA) + goto check_avail_space; + + /* Check: + During rebalance `migrate-data` - Destination subvol experiences + a `reduction` in 'blocks' of free space, at the same time source + subvol gains certain 'blocks' of free space. A valid check is + necessary here to avoid erroneous move to destination where + the space could be scantily available. + With heterogeneous brick support, an actual space comparison could + prevent any files being migrated to newly added bricks if they are + smaller then the free space available on the existing bricks. 
+ */ + if (!conf->use_fallocate) { + file_blocks = stbuf->ia_size + GF_DISK_SECTOR_SIZE - 1; + file_blocks /= GF_DISK_SECTOR_SIZE; + + if (file_blocks >= dst_statfs_blocks) { + dst_statfs_blocks = 0; + } else { + dst_statfs_blocks -= file_blocks; + } + } + + src_post_availspacepercent = ((src_statfs_blocks + file_blocks) * 100) / + src_total_blocks; + + dst_post_availspacepercent = (dst_statfs_blocks * 100) / dst_total_blocks; + + if (dst_post_availspacepercent < src_post_availspacepercent) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "data movement of file " + "{blocks:%" PRIu64 + " name:(%s)} would result in " + "dst node (%s:%" PRIu64 + ") having lower disk " + "space than the source node (%s:%" PRIu64 + ")" + ".Skipping file.", + stbuf->ia_blocks, loc->path, to->name, dst_statfs_blocks, + from->name, src_statfs_blocks); + + /* this is not a 'failure', but we don't want to + consider this as 'success' too :-/ */ + *fop_errno = ENOSPC; + ret = 1; + goto out; + } + +check_avail_space: + if (conf->disk_unit == 'p' && dst_statfs.f_blocks) { + dst_post_availspacepercent = (dst_statfs_blocks * 100) / + dst_total_blocks; + + gf_msg_debug(this->name, 0, + "file : %s, post_availspacepercent" + " : %lf f_bavail : %" PRIu64 " min-free-disk: %lf", + loc->path, dst_post_availspacepercent, dst_statfs.f_bavail, + conf->min_free_disk); + + if (dst_post_availspacepercent < conf->min_free_disk) { + gf_msg(this->name, GF_LOG_WARNING, 0, 0, + "Write will cross min-free-disk for " + "file - %s on subvol - %s. 
Looking " + "for new subvol", + loc->path, to->name); + + goto find_new_subvol; + } else { + ret = 0; + goto out; + } + } + + if (conf->disk_unit != 'p') { + if ((dst_statfs_blocks * GF_DISK_SECTOR_SIZE) < conf->min_free_disk) { + gf_msg_debug(this->name, 0, + "file : %s, destination frsize: %lu " + "f_bavail : %" PRIu64 " min-free-disk: %lf", + loc->path, dst_statfs.f_frsize, dst_statfs.f_bavail, + conf->min_free_disk); + + gf_msg(this->name, GF_LOG_WARNING, 0, 0, + "write will" + " cross min-free-disk for file - %s on subvol -" + " %s. looking for new subvol", + loc->path, to->name); + + goto find_new_subvol; + + } else { + ret = 0; + goto out; + } + } + +find_new_subvol: + layout = dht_layout_get(this, loc->parent); + if (!layout) { + gf_log(this->name, GF_LOG_ERROR, "Layout is NULL"); + *fop_errno = EINVAL; + ret = -1; + goto out; + } + + *new_subvol = dht_subvol_with_free_space_inodes(this, to, from, layout, + stbuf->ia_size); + if ((!(*new_subvol)) || (*new_subvol == from)) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_INSUFF_SPACE, + "Could not find any subvol" + " with space accommodating the file - %s. 
Consider " + "adding bricks", + loc->path); + + *target_changed = _gf_false; + *fop_errno = ENOSPC; + ret = -1; + } else { + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "new target found - %s" + " for file - %s", + (*new_subvol)->name, loc->path); + *target_changed = _gf_true; + ret = 0; + } + +out: + if (xdata) + dict_unref(xdata); + return ret; +} + +static int +__dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, + xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, + uint64_t ia_size, int hole_exists, int *fop_errno) +{ + int ret = 0; + int count = 0; + off_t offset = 0; + off_t data_offset = 0; + off_t hole_offset = 0; + struct iovec *vector = NULL; + struct iobref *iobref = NULL; + uint64_t total = 0; + size_t read_size = 0; + size_t data_block_size = 0; + dict_t *xdata = NULL; + dht_conf_t *conf = NULL; + + conf = this->private; + + /* if file size is '0', no need to enter this loop */ + while (total < ia_size) { + /* This is a regular file - read it sequentially */ + if (!hole_exists) { + read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) + ? 
DHT_REBALANCE_BLKSIZE + : (ia_size - total)); + } else { + /* This is a sparse file - read only the data segments in the file + */ + + /* If the previous data block is fully copied, find the next data + * segment + * starting at the offset of the last read and written byte, */ + if (data_block_size <= 0) { + ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL, + &data_offset); + if (ret) { + if (ret == -ENXIO) + ret = 0; /* No more data segments */ + else + *fop_errno = -ret; /* Error occurred */ + + break; + } + + /* If the position of the current data segment is greater than + * the position of the next hole, find the next hole in order to + * calculate the length of the new data segment */ + if (data_offset > hole_offset) { + /* Starting at the offset of the last data segment, find the + * next hole */ + ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE, + NULL, &hole_offset); + if (ret) { + /* If an error occurred here it's a real error because + * if the seek for a data segment was successful then + * necessarily another hole must exist (EOF is a hole) + */ + *fop_errno = -ret; + break; + } + + /* Calculate the total size of the current data block */ + data_block_size = hole_offset - data_offset; + } + } else { + /* There is still data in the current segment, move the + * data_offset to the position of the last written byte */ + data_offset = offset; + } + + /* Calculate how much data needs to be read and written. If the data + * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and + * write DHT_REBALANCE_BLKSIZE data length and the rest in the + * next iteration(s) */ + read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE) + ? 
DHT_REBALANCE_BLKSIZE + : data_block_size); + + /* Calculate the remaining size of the data block - maybe there's no + * need to seek for data in the next iteration */ + data_block_size -= read_size; + + /* Set offset to the offset of the data segment so read and write + * will have the correct position */ + offset = data_offset; + } + + ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count, + &iobref, NULL, NULL, NULL); + + if (!ret || (ret < 0)) { + if (!ret) { + /* File was probably truncated*/ + ret = -1; + *fop_errno = ENOSPC; + } else { + *fop_errno = -ret; + } + break; + } + + if (!conf->force_migration) { + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "insufficient memory"); + ret = -1; + *fop_errno = ENOMEM; + break; + } + + /* Fail this write and abort rebalance if we + * detect a write from client since migration of + * this file started. This is done to avoid + * potential data corruption due to out of order + * writes from rebalance and client to the same + * region (as compared between src and dst + * files). See + * https://github.com/gluster/glusterfs/issues/308 + * for more details. 
+ */ + ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1); + if (ret) { + gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM, + "failed to set dict"); + ret = -1; + *fop_errno = ENOMEM; + break; + } + } + } + + ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL, + NULL, xdata, NULL); + if (ret < 0) { + *fop_errno = -ret; + break; + } + + offset += ret; + total += ret; + + GF_FREE(vector); + if (iobref) + iobref_unref(iobref); + iobref = NULL; + vector = NULL; + } + if (iobref) + iobref_unref(iobref); + GF_FREE(vector); + + if (ret >= 0) + ret = 0; + else + ret = -1; + + if (xdata) { + dict_unref(xdata); + } + + return ret; +} + +static int +__dht_rebalance_open_src_file(xlator_t *this, xlator_t *from, xlator_t *to, + loc_t *loc, struct iatt *stbuf, fd_t **src_fd, + gf_boolean_t *clean_src, int *fop_errno) +{ + int ret = 0; + fd_t *fd = NULL; + dict_t *dict = NULL; + struct iatt iatt = { + 0, + }; + dht_conf_t *conf = NULL; + + conf = this->private; + + *clean_src = _gf_false; + + fd = fd_create(loc->inode, DHT_REBALANCE_PID); + if (!fd) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: fd create failed (source)", loc->path); + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + + ret = syncop_open(from, loc, O_RDWR, fd, NULL, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "failed to open file %s on %s", loc->path, from->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + + fd_bind(fd); + + if (src_fd) + *src_fd = fd; + + ret = -1; + dict = dict_new(); + if (!dict) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: Could not allocate memory for dict", loc->path); + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + + ret = dict_set_str(dict, conf->link_xattr_name, to->name); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "failed to set xattr in dict for %s (linkto:%s)", loc->path, + to->name); + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + + /* 
Once the migration starts, the source should have 'linkto' key set + to show which is the target, so other clients can work around it */ + ret = syncop_setxattr(from, loc, dict, 0, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "failed to set xattr on %s in %s", loc->path, from->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + + /* Reset source mode/xattr if migration fails*/ + *clean_src = _gf_true; + + /* mode should be (+S+T) to indicate migration is in progress */ + iatt.ia_prot = stbuf->ia_prot; + iatt.ia_type = stbuf->ia_type; + iatt.ia_prot.sticky = 1; + iatt.ia_prot.sgid = 1; + + ret = syncop_setattr(from, loc, &iatt, GF_SET_ATTR_MODE, NULL, NULL, NULL, + NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "failed to set mode on %s in %s", loc->path, from->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + + /* success */ + ret = 0; +out: + if (dict) + dict_unref(dict); + + return ret; +} + +int +migrate_special_files(xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, + struct iatt *buf, int *fop_errno) +{ + int ret = -1; + dict_t *rsp_dict = NULL; + dict_t *dict = NULL; + char *link = NULL; + struct iatt stbuf = { + 0, + }; + dht_conf_t *conf = this->private; + + dict = dict_new(); + if (!dict) { + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + ret = dict_set_int32(dict, conf->link_xattr_name, 256); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_log(this->name, GF_LOG_ERROR, + "%s: failed to set 'linkto' key in dict", loc->path); + goto out; + } + + /* check in the destination if the file is link file */ + ret = syncop_lookup(to, loc, &stbuf, NULL, dict, &rsp_dict); + if ((ret < 0) && (-ret != ENOENT)) { + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: lookup failed", loc->path); + *fop_errno = -ret; + ret = -1; + goto out; + } + + /* we no more require this key */ + dict_del(dict, conf->link_xattr_name); 
+ + /* file exists in target node, only if it is 'linkfile' its valid, + otherwise, error out */ + if (!ret) { + if (!check_is_linkfile(loc->inode, &stbuf, rsp_dict, + conf->link_xattr_name)) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: file exists in destination", loc->path); + *fop_errno = EINVAL; + ret = -1; + goto out; + } + + /* as file is linkfile, delete it */ + ret = syncop_unlink(to, loc, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to delete the linkfile", loc->path); + *fop_errno = -ret; + ret = -1; + goto out; + } + } + + /* Set the gfid of the source file in dict */ + ret = dict_set_gfuuid(dict, "gfid-req", buf->ia_gfid, true); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_log(this->name, GF_LOG_ERROR, + "%s: failed to set gfid in dict for create", loc->path); + goto out; + } + + /* Create the file in target */ + if (IA_ISLNK(buf->ia_type)) { + /* Handle symlinks separately */ + ret = syncop_readlink(from, loc, &link, buf->ia_size, NULL, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: readlink on symlink failed", loc->path); + *fop_errno = -ret; + ret = -1; + goto out; + } + + ret = syncop_symlink(to, loc, link, 0, dict, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, "%s: creating symlink failed", + loc->path); + *fop_errno = -ret; + ret = -1; + goto out; + } + + goto done; + } + + ret = syncop_mknod(to, loc, st_mode_from_ia(buf->ia_prot, buf->ia_type), + makedev(ia_major(buf->ia_rdev), ia_minor(buf->ia_rdev)), + 0, dict, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: mknod failed", loc->path); + *fop_errno = -ret; + ret = -1; + goto out; + } + +done: + ret = syncop_setattr(to, loc, buf, + (GF_SET_ATTR_MTIME | GF_SET_ATTR_UID | + GF_SET_ATTR_GID | GF_SET_ATTR_MODE), + NULL, NULL, NULL, NULL); 
+ if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to perform setattr on %s", loc->path, to->name); + *fop_errno = -ret; + } + + ret = syncop_unlink(from, loc, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: unlink failed", loc->path); + *fop_errno = -ret; + ret = -1; + } + +out: + GF_FREE(link); + if (dict) + dict_unref(dict); + + if (rsp_dict) + dict_unref(rsp_dict); + + return ret; +} + +static int +__dht_migration_cleanup_src_file(xlator_t *this, loc_t *loc, fd_t *fd, + xlator_t *from, ia_prot_t *src_ia_prot) +{ + int ret = -1; + dht_conf_t *conf = NULL; + struct iatt new_stbuf = { + 0, + }; + + if (!this || !fd || !from || !src_ia_prot) { + goto out; + } + + conf = this->private; + + /*Revert source mode and xattr changes*/ + ret = syncop_fstat(from, fd, &new_stbuf, NULL, NULL); + if (ret < 0) { + /* Failed to get the stat info */ + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file cleanup failed: failed to fstat " + "file %s on %s ", + loc->path, from->name); + ret = -1; + goto out; + } + + /* Remove the sticky bit and sgid bit set, reset it to 0*/ + if (!src_ia_prot->sticky) + new_stbuf.ia_prot.sticky = 0; + + if (!src_ia_prot->sgid) + new_stbuf.ia_prot.sgid = 0; + + ret = syncop_fsetattr(from, fd, &new_stbuf, + (GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL, NULL, + NULL, NULL); + + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file cleanup failed:" + "%s: failed to perform fsetattr on %s ", + loc->path, from->name); + ret = -1; + goto out; + } + + ret = syncop_fremovexattr(from, fd, conf->link_xattr_name, 0, NULL); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "%s: failed to remove linkto xattr on %s (%s)", loc->path, + from->name, strerror(-ret)); + ret = -1; + goto out; + } + + ret = 0; + +out: + return ret; +} + +/* + return values: + + -1 : failure + 0 : successfully 
migrated data + 1 : not a failure, but we can't migrate data as of now +*/ +int +dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, + int flag, int *fop_errno) +{ + int ret = -1; + struct iatt new_stbuf = { + 0, + }; + struct iatt stbuf = { + 0, + }; + struct iatt empty_iatt = { + 0, + }; + ia_prot_t src_ia_prot = { + 0, + }; + fd_t *src_fd = NULL; + fd_t *dst_fd = NULL; + dict_t *dict = NULL; + dict_t *xattr = NULL; + dict_t *xattr_rsp = NULL; + int file_has_holes = 0; + dht_conf_t *conf = this->private; + int rcvd_enoent_from_src = 0; + struct gf_flock flock = { + 0, + }; + struct gf_flock plock = { + 0, + }; + loc_t tmp_loc = { + 0, + }; + loc_t parent_loc = { + 0, + }; + gf_boolean_t inodelk_locked = _gf_false; + gf_boolean_t entrylk_locked = _gf_false; + gf_boolean_t p_locked = _gf_false; + int lk_ret = -1; + gf_defrag_info_t *defrag = NULL; + gf_boolean_t clean_src = _gf_false; + gf_boolean_t clean_dst = _gf_false; + int log_level = GF_LOG_INFO; + gf_boolean_t delete_src_linkto = _gf_true; + lock_migration_info_t locklist; + dict_t *meta_dict = NULL; + gf_boolean_t meta_locked = _gf_false; + gf_boolean_t target_changed = _gf_false; + xlator_t *new_target = NULL; + xlator_t *old_target = NULL; + xlator_t *hashed_subvol = NULL; + fd_t *linkto_fd = NULL; + dict_t *xdata = NULL; + + if (from == to) { + gf_msg_debug(this->name, 0, + "destination and source are same. 
file %s" + " might have migrated already", + loc->path); + ret = 0; + goto out; + } + + gf_log(this->name, log_level, "%s: attempting to move from %s to %s", + loc->path, from->name, to->name); + + dict = dict_new(); + if (!dict) { + ret = -1; + *fop_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "Could not allocate memory for dict"); + goto out; + } + ret = dict_set_int32(dict, conf->link_xattr_name, 256); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: failed to set 'linkto' key in dict", + loc->path); + goto out; + } + + /* Do not migrate file in case lock migration is not enabled on the + * volume*/ + if (!conf->lock_migration_enabled) { + ret = dict_set_int32(dict, GLUSTERFS_POSIXLK_COUNT, sizeof(int32_t)); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: %s: failed to " + "set " GLUSTERFS_POSIXLK_COUNT " key in dict", + loc->path); + goto out; + } + } else { + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "locks will be migrated" + " for file: %s", + loc->path); + } + + /* The file is locked to prevent a rename during a migration. Renames + * and migrations on the file at the same time can lead to data loss. + */ + + ret = dht_build_parent_loc(this, &parent_loc, loc, fop_errno); + if (ret < 0) { + ret = -1; + gf_msg(this->name, GF_LOG_WARNING, *fop_errno, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to build parent loc, which is needed to " + "acquire entrylk to synchronize with renames on this " + "path. Skipping migration", + loc->path); + goto out; + } + + hashed_subvol = dht_subvol_get_hashed(this, loc); + if (hashed_subvol == NULL) { + ret = -1; + gf_msg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: cannot find hashed subvol which is needed to " + "synchronize with renames on this path. 
" + "Skipping migration", + loc->path); + goto out; + } + + flock.l_type = F_WRLCK; + + tmp_loc.inode = inode_ref(loc->inode); + gf_uuid_copy(tmp_loc.gfid, loc->gfid); + tmp_loc.path = gf_strdup(loc->path); + + /* this inodelk happens with flock.owner being zero. But to synchronize + * hardlink migration we need to have different lkowner for each migration + * Filed a bug here: https://bugzilla.redhat.com/show_bug.cgi?id=1468202 to + * track the fix for this. Currently synclock takes care of synchronizing + * hardlink migration. Once this bug is fixed we can avoid taking synclock + */ + ret = syncop_inodelk(from, DHT_FILE_MIGRATE_DOMAIN, &tmp_loc, F_SETLKW, + &flock, NULL, NULL); + if (ret < 0) { + *fop_errno = -ret; + ret = -1; + gf_msg(this->name, GF_LOG_WARNING, *fop_errno, + DHT_MSG_MIGRATE_FILE_FAILED, + "migrate file failed: " + "%s: failed to lock file on %s", + loc->path, from->name); + goto out; + } + + inodelk_locked = _gf_true; + + /* dht_rename has changed to use entrylk on hashed subvol for + * synchronization. So, rebalance too has to acquire an entrylk on + * hashed subvol. 
+ */ + ret = syncop_entrylk(hashed_subvol, DHT_ENTRY_SYNC_DOMAIN, &parent_loc, + loc->name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL, NULL); + if (ret < 0) { + *fop_errno = -ret; + ret = -1; + gf_msg(this->name, GF_LOG_WARNING, *fop_errno, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to acquire entrylk on subvol %s", loc->path, + hashed_subvol->name); + goto out; + } + + entrylk_locked = _gf_true; + + /* Phase 1 - Data migration is in progress from now on */ + ret = syncop_lookup(from, loc, &stbuf, NULL, dict, &xattr_rsp); + if (ret) { + *fop_errno = -ret; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, *fop_errno, + DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: lookup failed on %s", + loc->path, from->name); + goto out; + } + + /* preserve source mode, so set the same to the destination */ + src_ia_prot = stbuf.ia_prot; + + /* Check if file can be migrated */ + ret = __is_file_migratable(this, loc, &stbuf, xattr_rsp, flag, defrag, conf, + fop_errno); + if (ret) { + if (ret == HARDLINK_MIG_INPROGRESS) + ret = 0; + goto out; + } + + /* Take care of the special files */ + if (!IA_ISREG(stbuf.ia_type)) { + /* Special files */ + ret = migrate_special_files(this, from, to, loc, &stbuf, fop_errno); + goto out; + } + + /* Try to preserve 'holes' while migrating data */ + if (stbuf.ia_size > (stbuf.ia_blocks * GF_DISK_SECTOR_SIZE)) + file_has_holes = 1; + + /* create the destination, with required modes/xattr */ + ret = __dht_rebalance_create_dst_file(this, to, from, loc, &stbuf, &dst_fd, + fop_errno, file_has_holes); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "Create dst failed" + " on - %s for file - %s", + to->name, loc->path); + goto out; + } + + clean_dst = _gf_true; + + ret = __dht_check_free_space(this, to, from, loc, &stbuf, flag, conf, + &target_changed, &new_target, fop_errno); + if (target_changed) { + /* Can't handle for hardlinks. 
Marking this as failure */ + if (flag == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS || stbuf.ia_nlink > 1) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SUBVOL_INSUFF_SPACE, + "Exiting migration for" + " file - %s. flag - %d, stbuf.ia_nlink - %d", + loc->path, flag, stbuf.ia_nlink); + ret = -1; + goto out; + } + + ret = syncop_ftruncate(to, dst_fd, 0, NULL, NULL, NULL, NULL); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "%s: failed to perform truncate on %s (%s)", loc->path, + to->name, strerror(-ret)); + } + + syncop_close(dst_fd); + dst_fd = NULL; + + old_target = to; + to = new_target; + + clean_dst = _gf_false; + + /* if the file migration is successful to this new target, then + * update the xattr on the old destination to point the new + * destination. We need to do update this only post migration + * as in case of failure the linkto needs to point to the source + * subvol */ + ret = __dht_rebalance_create_dst_file( + this, to, from, loc, &stbuf, &dst_fd, fop_errno, file_has_holes); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "Create dst failed" + " on - %s for file - %s", + to->name, loc->path); + goto out; + } else { + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "destination for file " + "- %s is changed to - %s", + loc->path, to->name); + clean_dst = _gf_true; + } + } + + if (ret) { + goto out; + } + + /* Open the source, and also update mode/xattr */ + ret = __dht_rebalance_open_src_file(this, from, to, loc, &stbuf, &src_fd, + &clean_src, fop_errno); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: failed to open %s on %s", loc->path, + from->name); + goto out; + } + + /* TODO: move all xattr related operations to fd based operations */ + ret = syncop_listxattr(from, loc, &xattr, NULL, NULL); + if (ret < 0) { + *fop_errno = -ret; + gf_msg(this->name, GF_LOG_WARNING, *fop_errno, + DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: failed to get xattr from %s", + loc->path, from->name); + 
ret = -1; + goto out; + } + + /* Copying posix acls to the linkto file messes up the permissions*/ + dht_strip_out_acls(xattr); + + /* Remove the linkto xattr as we don't want to overwrite the value + * set on the dst. + */ + dict_del(xattr, conf->link_xattr_name); + + /* We need to error out if this fails as having the wrong shard xattrs + * set on the dst could cause data corruption + */ + ret = syncop_fsetxattr(to, dst_fd, xattr, 0, NULL, NULL); + if (ret < 0) { + *fop_errno = -ret; + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to set xattr on %s", loc->path, to->name); + ret = -1; + goto out; + } + + if (xattr_rsp) { + /* we no more require this key */ + dict_del(dict, conf->link_xattr_name); + dict_unref(xattr_rsp); + } + + ret = syncop_fstat(from, src_fd, &stbuf, dict, &xattr_rsp); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:failed to lookup %s on %s ", loc->path, + from->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + + /* Check again if file has hardlink */ + ret = __check_file_has_hardlink(this, loc, &stbuf, xattr_rsp, flag, defrag, + conf, fop_errno); + if (ret) { + if (ret == HARDLINK_MIG_INPROGRESS) + ret = 0; + goto out; + } + + ret = __dht_rebalance_migrate_data(this, defrag, from, to, src_fd, dst_fd, + stbuf.ia_size, file_has_holes, + fop_errno); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: %s: failed to migrate data", loc->path); + + ret = -1; + goto out; + } + + /* TODO: Sync the locks */ + + xdata = dict_new(); + if (!xdata || dict_set_int8(xdata, "last-fsync", 1)) { + gf_log(this->name, GF_LOG_ERROR, + "%s: failed to set last-fsync flag on " + "%s (%s)", + loc->path, to->name, strerror(ENOMEM)); + } + + ret = syncop_fsync(to, dst_fd, 0, NULL, NULL, xdata, NULL); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, "%s: failed to fsync on %s (%s)", + loc->path, to->name, 
strerror(-ret)); + *fop_errno = -ret; + } + + /* Phase 2 - Data-Migration Complete, Housekeeping updates pending */ + + ret = syncop_fstat(from, src_fd, &new_stbuf, NULL, NULL); + if (ret < 0) { + /* Failed to get the stat info */ + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: failed to fstat file %s on %s ", loc->path, + from->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + + /* Lock the entire source file to prevent clients from taking a + lock on it as dht_lk does not handle file migration. + + This still leaves a small window where conflicting locks can + be granted to different clients. If client1 requests a blocking + lock on the src file, it will be granted after the migrating + process releases its lock. If client2 requests a lock on the dst + data file, it will also be granted, but all FOPs will be redirected + to the dst data file. + */ + + /* Take meta lock */ + + if (conf->lock_migration_enabled) { + meta_dict = dict_new(); + if (!meta_dict) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "dict_new failed"); + + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + + ret = dict_set_str(meta_dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value: key = %s," + " path = %s", + GLUSTERFS_INTERNAL_FOP_KEY, loc->path); + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + + ret = dict_set_int32(meta_dict, GF_META_LOCK_KEY, 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Trace dict_set failed"); + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + + ret = syncop_setxattr(from, loc, meta_dict, 0, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Trace syncop_setxattr metalock failed"); + + *fop_errno = -ret; + ret = -1; + goto out; + } else { + meta_locked = _gf_true; + } + } + + if 
(!conf->lock_migration_enabled) { + plock.l_type = F_WRLCK; + plock.l_start = 0; + plock.l_len = 0; + plock.l_whence = SEEK_SET; + + ret = syncop_lk(from, src_fd, F_SETLK, &plock, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: Failed to lock on %s", + loc->path, from->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + + p_locked = _gf_true; + + } else { + INIT_LIST_HEAD(&locklist.list); + + ret = syncop_getactivelk(from, loc, &locklist, NULL, NULL); + if (ret == 0) { + gf_log(this->name, GF_LOG_INFO, "No active locks on:%s", loc->path); + + } else if (ret > 0) { + ret = syncop_setactivelk(to, loc, &locklist, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_LOCK_MIGRATION_FAILED, "write lock failed on:%s", + loc->path); + + *fop_errno = -ret; + ret = -1; + goto metaunlock; + } + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_LOCK_MIGRATION_FAILED, + "getactivelk failed for file: %s", loc->path); + *fop_errno = -ret; + } + } + + /* source would have both sticky bit and sgid bit set, reset it to 0, + and set the source permission on destination, if it was not set + prior to setting rebalance-modes in source */ + if (!src_ia_prot.sticky) + new_stbuf.ia_prot.sticky = 0; + + if (!src_ia_prot.sgid) + new_stbuf.ia_prot.sgid = 0; + + /* TODO: if the source actually had sticky bit, or sgid bit set, + we are not handling it */ + + ret = syncop_fsetattr( + to, dst_fd, &new_stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL, NULL, + NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: failed to perform setattr on %s ", + loc->path, to->name); + *fop_errno = -ret; + ret = -1; + goto metaunlock; + } + + /* Because 'futimes' is not portable */ + ret = syncop_setattr(to, loc, &new_stbuf, + (GF_SET_ATTR_MTIME | GF_SET_ATTR_ATIME), NULL, NULL, + NULL, NULL); + if 
(ret) { + gf_log(this->name, GF_LOG_WARNING, + "%s: failed to perform setattr on %s ", loc->path, to->name); + *fop_errno = -ret; + } + + if (target_changed) { + dict_del(dict, GLUSTERFS_POSIXLK_COUNT); + ret = dict_set_str(dict, conf->link_xattr_name, to->name); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "failed to set xattr in dict for %s (linkto:%s)", loc->path, + to->name); + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + + ret = syncop_setxattr(old_target, loc, dict, 0, NULL, NULL); + if (ret && -ret != ESTALE && -ret != ENOENT) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "failed to set xattr on %s in %s", loc->path, + old_target->name); + *fop_errno = -ret; + ret = -1; + goto out; + } else if (-ret == ESTALE || -ret == ENOENT) { + /* The failure ESTALE indicates that the linkto + * file on the hashed subvol might have been deleted. + * In this case will create a linkto file with new target + * as linkto xattr value*/ + linkto_fd = fd_create(loc->inode, DHT_REBALANCE_PID); + if (!linkto_fd) { + gf_msg(this->name, GF_LOG_ERROR, errno, + DHT_MSG_MIGRATE_FILE_FAILED, "%s: fd create failed", + loc->path); + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + ret = syncop_create(old_target, loc, O_RDWR, DHT_LINKFILE_MODE, + linkto_fd, NULL, dict, NULL); + if (ret != 0 && -ret != EEXIST && -ret != ESTALE) { + *fop_errno = -ret; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "failed to create linkto file on %s in %s", loc->path, + old_target->name); + goto out; + } else if (ret == 0) { + ret = syncop_fsetattr(old_target, linkto_fd, &stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL, + NULL, NULL, NULL); + if (ret < 0) { + *fop_errno = -ret; + gf_msg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "chown failed for %s on %s", loc->path, + old_target->name); + } + } + } + } + + clean_dst = _gf_false; + + /* Posix acls are not set on DHT linkto files as part of the initial + 
* initial xattrs set on the dst file, so these need + * to be set on the dst file after the linkto attrs are removed. + * TODO: Optimize this. + */ + if (xattr) { + dict_unref(xattr); + xattr = NULL; + } + + /* Set only the Posix ACLs this time */ + ret = syncop_getxattr(from, loc, &xattr, POSIX_ACL_ACCESS_XATTR, NULL, + NULL); + if (ret < 0) { + if ((-ret != ENODATA) && (-ret != ENOATTR)) { + gf_msg(this->name, GF_LOG_WARNING, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: failed to get xattr from %s", + loc->path, from->name); + *fop_errno = -ret; + } + } else { + ret = syncop_setxattr(to, loc, xattr, 0, NULL, NULL); + if (ret < 0) { + /* Potential problem here where Posix ACLs will + * not be set on the target file */ + + gf_msg(this->name, GF_LOG_WARNING, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: failed to set xattr on %s", + loc->path, to->name); + *fop_errno = -ret; + } + } + + /* The src file is being unlinked after this so we don't need + to clean it up */ + clean_src = _gf_false; + + /* Make the source as a linkfile first before deleting it */ + empty_iatt.ia_prot.sticky = 1; + ret = syncop_fsetattr(from, src_fd, &empty_iatt, GF_SET_ATTR_MODE, NULL, + NULL, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: failed to perform setattr on %s ", + loc->path, from->name); + *fop_errno = -ret; + ret = -1; + goto metaunlock; + } + + /* Free up the data blocks on the source node, as the whole + file is migrated */ + ret = syncop_ftruncate(from, src_fd, 0, NULL, NULL, NULL, NULL); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "%s: failed to perform truncate on %s (%s)", loc->path, + from->name, strerror(-ret)); + *fop_errno = -ret; + } + + /* remove the 'linkto' xattr from the destination */ + ret = syncop_fremovexattr(to, dst_fd, conf->link_xattr_name, 0, NULL); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "%s: failed to 
perform removexattr on %s (%s)", loc->path, + to->name, strerror(-ret)); + *fop_errno = -ret; + } + + /* Do a stat and check the gfid before unlink */ + + /* + * Cached file changes its state from non-linkto to linkto file after + * migrating data. If lookup from any other mount-point is performed, + * converted-linkto-cached file will be treated as a stale and will be + * unlinked. But by this time, file is already migrated. So further + * failure because of ENOENT should not be treated as error + */ + + ret = syncop_stat(from, loc, &empty_iatt, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to do a stat on %s", loc->path, from->name); + + if (-ret != ENOENT) { + *fop_errno = -ret; + ret = -1; + goto metaunlock; + } + + rcvd_enoent_from_src = 1; + } + + if ((gf_uuid_compare(empty_iatt.ia_gfid, loc->gfid) == 0) && + (!rcvd_enoent_from_src) && delete_src_linkto) { + /* take out the source from namespace */ + ret = syncop_unlink(from, loc, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to perform unlink on %s", loc->path, from->name); + *fop_errno = -ret; + ret = -1; + goto metaunlock; + } + } + + ret = syncop_lookup(this, loc, NULL, NULL, NULL, NULL); + if (ret) { + gf_msg_debug(this->name, -ret, + "%s: failed to lookup the file on subvolumes", loc->path); + *fop_errno = -ret; + } + + gf_msg(this->name, log_level, 0, DHT_MSG_MIGRATE_FILE_COMPLETE, + "completed migration of %s from subvolume %s to %s", loc->path, + from->name, to->name); + + ret = 0; + +metaunlock: + + if (conf->lock_migration_enabled && meta_locked) { + dict_del(meta_dict, GF_META_LOCK_KEY); + + ret = dict_set_int32(meta_dict, GF_META_UNLOCK_KEY, 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Trace dict_set failed"); + + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + + if (clean_dst == _gf_false) + ret = 
dict_set_int32(meta_dict, "status", 1); + else + ret = dict_set_int32(meta_dict, "status", 0); + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Trace dict_set failed"); + + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + + ret = syncop_setxattr(from, loc, meta_dict, 0, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Trace syncop_setxattr meta unlock failed"); + + *fop_errno = -ret; + ret = -1; + goto out; + } + } + +out: + if (clean_src) { + /* Revert source mode and xattr changes*/ + lk_ret = __dht_migration_cleanup_src_file(this, loc, src_fd, from, + &src_ia_prot); + if (lk_ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to cleanup source file on %s", loc->path, + from->name); + } + } + + /* reset the destination back to 0 */ + if (clean_dst) { + lk_ret = syncop_ftruncate(to, dst_fd, 0, NULL, NULL, NULL, NULL); + if (lk_ret) { + gf_msg(this->name, GF_LOG_ERROR, -lk_ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: " + "%s: failed to reset target size back to 0", + loc->path); + } + } + + if (inodelk_locked) { + flock.l_type = F_UNLCK; + + lk_ret = syncop_inodelk(from, DHT_FILE_MIGRATE_DOMAIN, &tmp_loc, + F_SETLK, &flock, NULL, NULL); + if (lk_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -lk_ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to unlock file on %s", loc->path, from->name); + } + } + + if (entrylk_locked) { + lk_ret = syncop_entrylk(hashed_subvol, DHT_ENTRY_SYNC_DOMAIN, + &parent_loc, loc->name, ENTRYLK_UNLOCK, + ENTRYLK_UNLOCK, NULL, NULL); + if (lk_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -lk_ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to unlock entrylk on %s", loc->path, + hashed_subvol->name); + } + } + + if (p_locked) { + plock.l_type = F_UNLCK; + lk_ret = syncop_lk(from, src_fd, F_SETLK, &plock, NULL, NULL); + + if (lk_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -lk_ret, + 
DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to unlock file on %s", loc->path, from->name); + } + } + + lk_ret = syncop_removexattr(to, loc, GF_PROTECT_FROM_EXTERNAL_WRITES, NULL, + NULL); + if (lk_ret && (lk_ret != -ENODATA) && (lk_ret != -ENOATTR)) { + gf_msg(this->name, GF_LOG_WARNING, -lk_ret, 0, + "%s: removexattr failed key %s", loc->path, + GF_PROTECT_FROM_EXTERNAL_WRITES); + } + + if (dict) + dict_unref(dict); + + if (xattr) + dict_unref(xattr); + if (xattr_rsp) + dict_unref(xattr_rsp); + + if (dst_fd) + syncop_close(dst_fd); + + if (src_fd) + syncop_close(src_fd); + if (linkto_fd) + syncop_close(linkto_fd); + + if (xdata) + dict_unref(xdata); + + loc_wipe(&tmp_loc); + loc_wipe(&parent_loc); + + return ret; +} + +static int +rebalance_task(void *data) +{ + int ret = -1; + dht_local_t *local = NULL; + call_frame_t *frame = NULL; + int fop_errno = 0; + + frame = data; + + local = frame->local; + + /* This function is 'synchrounous', hence if it returns, + we are done with the task */ + ret = dht_migrate_file(THIS, &local->loc, local->rebalance.from_subvol, + local->rebalance.target_node, local->flags, + &fop_errno); + + return ret; +} + +static int +rebalance_task_completion(int op_ret, call_frame_t *sync_frame, void *data) +{ + int32_t op_errno = EINVAL; + + if (op_ret == -1) { + /* Failure of migration process, mostly due to write process. + as we can't preserve the exact errno, lets say there was + no space to migrate-data + */ + op_errno = ENOSPC; + } else if (op_ret == 1) { + /* migration didn't happen, but is not a failure, let the user + understand that he doesn't have permission to migrate the + file. 
+ */ + op_ret = -1; + op_errno = EPERM; + } else if (op_ret != 0) { + op_errno = -op_ret; + op_ret = -1; + } + + DHT_STACK_UNWIND(setxattr, sync_frame, op_ret, op_errno, NULL); + return 0; +} + +int +dht_start_rebalance_task(xlator_t *this, call_frame_t *frame) +{ + int ret = -1; + + ret = synctask_new(this->ctx->env, rebalance_task, + rebalance_task_completion, frame, frame); + return ret; +} + +int +gf_listener_stop(xlator_t *this) +{ + glusterfs_ctx_t *ctx = NULL; + cmd_args_t *cmd_args = NULL; + int ret = 0; + + ctx = this->ctx; + GF_ASSERT(ctx); + cmd_args = &ctx->cmd_args; + if (cmd_args->sock_file) { + ret = sys_unlink(cmd_args->sock_file); + if (ret && (ENOENT == errno)) { + ret = 0; + } + } + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, DHT_MSG_SOCKET_ERROR, + "Failed to unlink listener " + "socket %s", + cmd_args->sock_file); + } + return ret; +} + +void +dht_build_root_inode(xlator_t *this, inode_t **inode) +{ + inode_table_t *itable = NULL; + static uuid_t root_gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + + itable = inode_table_new(0, this); + if (!itable) + return; + + *inode = inode_find(itable, root_gfid); +} + +void +dht_build_root_loc(inode_t *inode, loc_t *loc) +{ + loc->path = "/"; + loc->inode = inode; + loc->inode->ia_type = IA_IFDIR; + memset(loc->gfid, 0, 16); + loc->gfid[15] = 1; +} + +/* return values: 1 -> error, bug ignore and continue + 0 -> proceed + -1 -> error, handle it */ +int32_t +gf_defrag_handle_migrate_error(int32_t op_errno, gf_defrag_info_t *defrag) +{ + int ret = 0; + /* if errno is not ENOTCONN, we can still continue + with rebalance process */ + if (op_errno != ENOTCONN) { + ret = 1; + goto out; + } + + if (op_errno == ENOTCONN) { + /* Most probably mount point went missing (mostly due + to a brick down), say rebalance failure to user, + let him restart it if everything is fine */ + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + ret = -1; + goto out; + } + +out: + return ret; +} + +static 
gf_boolean_t +gf_defrag_pattern_match(gf_defrag_info_t *defrag, char *name, uint64_t size) +{ + gf_defrag_pattern_list_t *trav = NULL; + gf_boolean_t match = _gf_false; + gf_boolean_t ret = _gf_false; + + GF_VALIDATE_OR_GOTO("dht", defrag, out); + + trav = defrag->defrag_pattern; + while (trav) { + if (!fnmatch(trav->path_pattern, name, FNM_NOESCAPE)) { + match = _gf_true; + break; + } + trav = trav->next; + } + + if ((match == _gf_true) && (size >= trav->size)) + ret = _gf_true; + +out: + return ret; +} + +int +dht_dfreaddirp_done(dht_dfoffset_ctx_t *offset_var, int cnt) +{ + int i; + int result = 1; + + for (i = 0; i < cnt; i++) { + if (offset_var[i].readdir_done == 0) { + result = 0; + break; + } + } + return result; +} + +int static gf_defrag_ctx_subvols_init(dht_dfoffset_ctx_t *offset_var, + xlator_t *this) +{ + int i; + dht_conf_t *conf = NULL; + + conf = this->private; + + if (!conf) + return -1; + + for (i = 0; i < conf->local_subvols_cnt; i++) { + offset_var[i].this = conf->local_subvols[i]; + offset_var[i].offset = (off_t)0; + offset_var[i].readdir_done = 0; + } + + return 0; +} + +static int +dht_get_first_non_null_index(subvol_nodeuuids_info_t *entry) +{ + int i = 0; + int index = 0; + + for (i = 0; i < entry->count; i++) { + if (!gf_uuid_is_null(entry->elements[i].uuid)) { + index = i; + goto out; + } + } + + if (i == entry->count) { + index = -1; + } +out: + return index; +} + +/* Return value + * 0 : this node does not migrate the file + * 1 : this node migrates the file + * + * Use the hash value of the gfid to determine which node will migrate files. + * Using the gfid instead of the name also ensures that the same node handles + * all hardlinks. 
+ */ + +gf_boolean_t +gf_defrag_should_i_migrate(xlator_t *this, int local_subvol_index, uuid_t gfid) +{ + gf_boolean_t ret = _gf_false; + int i = local_subvol_index; + char *str = NULL; + uint32_t hashval = 0; + int32_t index = 0; + dht_conf_t *conf = NULL; + char buf[UUID_CANONICAL_FORM_LEN + 1] = { + 0, + }; + subvol_nodeuuids_info_t *entry = NULL; + + conf = this->private; + + /* Pure distribute. A subvol in this case + will be handled by only one node */ + + entry = &(conf->local_nodeuuids[i]); + if (entry->count == 1) { + return 1; + } + + str = uuid_utoa_r(gfid, buf); + if (dht_hash_compute(this, 0, str, &hashval) == 0) { + index = (hashval % entry->count); + if (entry->elements[index].info == REBAL_NODEUUID_MINE) { + /* Index matches this node's nodeuuid.*/ + ret = _gf_true; + goto out; + } + + /* Brick down - some other node has to migrate these files*/ + if (gf_uuid_is_null(entry->elements[index].uuid)) { + /* Fall back to the first non-null index */ + index = dht_get_first_non_null_index(entry); + + if (index == -1) { + /* None of the bricks in the subvol are up. 
+ * CHILD_DOWN will kill the process soon */ + + return _gf_false; + } + + if (entry->elements[index].info == REBAL_NODEUUID_MINE) { + /* Index matches this node's nodeuuid.*/ + ret = _gf_true; + goto out; + } + } + } +out: + return ret; +} + +int +gf_defrag_migrate_single_file(void *opaque) +{ + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + int ret = 0; + gf_dirent_t *entry = NULL; + struct timeval start = { + 0, + }; + loc_t entry_loc = { + 0, + }; + loc_t *loc = NULL; + struct iatt iatt = { + 0, + }; + dict_t *migrate_data = NULL; + struct timeval end = { + 0, + }; + double elapsed = { + 0, + }; + struct dht_container *rebal_entry = NULL; + inode_t *inode = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + call_frame_t *statfs_frame = NULL; + xlator_t *old_THIS = NULL; + data_t *tmp = NULL; + int fop_errno = 0; + gf_dht_migrate_data_type_t rebal_type = GF_DHT_MIGRATE_DATA; + char value[MAX_REBAL_TYPE_SIZE] = { + 0, + }; + struct iatt *iatt_ptr = NULL; + gf_boolean_t update_skippedcount = _gf_true; + int i = 0; + gf_boolean_t should_i_migrate = 0; + + rebal_entry = (struct dht_container *)opaque; + if (!rebal_entry) { + gf_log("DHT", GF_LOG_ERROR, "rebal_entry is NULL"); + ret = -1; + goto out; + } + + this = rebal_entry->this; + + conf = this->private; + + defrag = conf->defrag; + + loc = rebal_entry->parent_loc; + + migrate_data = rebal_entry->migrate_data; + + entry = rebal_entry->df_entry; + iatt_ptr = &entry->d_stat; + + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = -1; + goto out; + } + + if (defrag->stats == _gf_true) { + gettimeofday(&start, NULL); + } + + if (defrag->defrag_pattern && + (gf_defrag_pattern_match(defrag, entry->d_name, + entry->d_stat.ia_size) == _gf_false)) { + gf_log(this->name, GF_LOG_ERROR, "pattern_match failed"); + goto out; + } + + memset(&entry_loc, 0, sizeof(entry_loc)); + + ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name); + if 
(ret) { + LOCK(&defrag->lock); + { + defrag->total_failures += 1; + } + UNLOCK(&defrag->lock); + + ret = 0; + + gf_log(this->name, GF_LOG_ERROR, "Child loc build failed"); + + goto out; + } + + should_i_migrate = gf_defrag_should_i_migrate( + this, rebal_entry->local_subvol_index, entry->d_stat.ia_gfid); + + gf_uuid_copy(entry_loc.gfid, entry->d_stat.ia_gfid); + + gf_uuid_copy(entry_loc.pargfid, loc->gfid); + + ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL); + + if (!should_i_migrate) { + /* this node isn't supposed to migrate the file. suppressing any + * potential error from lookup as this file is under migration by + * another node */ + if (ret) { + gf_msg_debug(this->name, -ret, + "Ignoring lookup failure: node isn't migrating %s", + entry_loc.path); + ret = 0; + } + gf_msg_debug(this->name, 0, "Don't migrate %s ", entry_loc.path); + goto out; + } + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: %s lookup failed", entry_loc.path); + + /* Increase failure count only for remove-brick op, so that + * user is warned to check the removed-brick for any files left + * unmigrated + */ + if (conf->decommission_subvols_cnt) { + LOCK(&defrag->lock); + { + defrag->total_failures += 1; + } + UNLOCK(&defrag->lock); + } + + ret = 0; + goto out; + } + + iatt_ptr = &iatt; + + hashed_subvol = dht_subvol_get_hashed(this, &entry_loc); + if (!hashed_subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Failed to get hashed subvol for %s", entry_loc.path); + ret = 0; + goto out; + } + + cached_subvol = dht_subvol_get_cached(this, entry_loc.inode); + if (!cached_subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CACHED_SUBVOL_GET_FAILED, + "Failed to get cached subvol for %s", entry_loc.path); + + ret = 0; + goto out; + } + + if (hashed_subvol == cached_subvol) { + ret = 0; + goto out; + } + + inode = inode_link(entry_loc.inode, entry_loc.parent, entry->d_name, &iatt); + 
inode_unref(entry_loc.inode); + /* use the inode returned by inode_link */ + entry_loc.inode = inode; + + old_THIS = THIS; + THIS = this; + statfs_frame = create_frame(this, this->ctx->pool); + if (!statfs_frame) { + gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM, + "Insufficient memory. Frame creation failed"); + ret = -1; + goto out; + } + + /* async statfs information for honoring min-free-disk */ + dht_get_du_info(statfs_frame, this, loc); + THIS = old_THIS; + + tmp = dict_get(migrate_data, GF_XATTR_FILE_MIGRATE_KEY); + if (tmp) { + memcpy(value, tmp->data, tmp->len); + if (strcmp(value, "force") == 0) + rebal_type = GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS; + + if (conf->decommission_in_progress) + rebal_type = GF_DHT_MIGRATE_HARDLINK; + } + + ret = dht_migrate_file(this, &entry_loc, cached_subvol, hashed_subvol, + rebal_type, &fop_errno); + if (ret == 1) { + if (fop_errno == ENOSPC) { + gf_msg_debug(this->name, 0, + "migrate-data skipped for" + " %s due to space constraints", + entry_loc.path); + + /* For remove-brick case if the source is not one of the + * removed-brick, do not mark the error as failure */ + if (conf->decommission_subvols_cnt) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i] == cached_subvol) { + LOCK(&defrag->lock); + { + defrag->total_failures += 1; + update_skippedcount = _gf_false; + } + UNLOCK(&defrag->lock); + + break; + } + } + } + + if (update_skippedcount) { + LOCK(&defrag->lock); + { + defrag->skipped += 1; + } + UNLOCK(&defrag->lock); + + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_MIGRATE_FILE_SKIPPED, + "File migration skipped for %s.", entry_loc.path); + } + + } else if (fop_errno == ENOTSUP) { + gf_msg_debug(this->name, 0, + "migrate-data skipped for" + " hardlink %s ", + entry_loc.path); + LOCK(&defrag->lock); + { + defrag->skipped += 1; + } + UNLOCK(&defrag->lock); + + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_MIGRATE_FILE_SKIPPED, + "File migration skipped for %s.", 
entry_loc.path); + } + + ret = 0; + goto out; + } else if (ret < 0) { + if (fop_errno != EEXIST) { + gf_msg(this->name, GF_LOG_ERROR, fop_errno, + DHT_MSG_MIGRATE_FILE_FAILED, "migrate-data failed for %s", + entry_loc.path); + + LOCK(&defrag->lock); + { + defrag->total_failures += 1; + } + UNLOCK(&defrag->lock); + } + + ret = gf_defrag_handle_migrate_error(fop_errno, defrag); + + if (!ret) { + gf_msg(this->name, GF_LOG_ERROR, fop_errno, + DHT_MSG_MIGRATE_FILE_FAILED, + "migrate-data on %s failed:", entry_loc.path); + } else if (ret == 1) { + ret = 0; + } + + goto out; + } + + LOCK(&defrag->lock); + { + defrag->total_files += 1; + defrag->total_data += iatt.ia_size; + } + UNLOCK(&defrag->lock); + + if (defrag->stats == _gf_true) { + gettimeofday(&end, NULL); + elapsed = gf_tvdiff(&start, &end); + gf_log(this->name, GF_LOG_INFO, + "Migration of " + "file:%s size:%" PRIu64 + " bytes took %.2f" + "secs and ret: %d", + entry_loc.name, iatt.ia_size, elapsed / 1e6, ret); + } + +out: + if (statfs_frame) { + STACK_DESTROY(statfs_frame->root); + } + + if (iatt_ptr) { + LOCK(&defrag->lock); + { + defrag->size_processed += iatt_ptr->ia_size; + } + UNLOCK(&defrag->lock); + } + loc_wipe(&entry_loc); + + return ret; +} + +void * +gf_defrag_task(void *opaque) +{ + struct list_head *q_head = NULL; + struct dht_container *iterator = NULL; + gf_defrag_info_t *defrag = NULL; + int ret = 0; + pid_t pid = GF_CLIENT_PID_DEFRAG; + + defrag = (gf_defrag_info_t *)opaque; + if (!defrag) { + gf_msg("dht", GF_LOG_ERROR, 0, 0, "defrag is NULL"); + goto out; + } + + syncopctx_setfspid(&pid); + + q_head = &(defrag->queue[0].list); + + /* The following while loop will dequeue one entry from the defrag->queue + under lock. We will update the defrag->global_error only when there + is an error which is critical to stop the rebalance process. The stop + message will be intimated to other migrator threads by setting the + defrag->defrag_status to GF_DEFRAG_STATUS_FAILED. 
+ + In defrag->queue, a low watermark (MIN_MIGRATE_QUEUE_COUNT) is + maintained so that crawler does not starve the file migration + workers and a high watermark (MAX_MIGRATE_QUEUE_COUNT) so that + crawler does not go far ahead in filling up the queue. + */ + + while (_gf_true) { + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + pthread_cond_broadcast(&defrag->rebalance_crawler_alarm); + pthread_cond_broadcast(&defrag->parallel_migration_cond); + goto out; + } + + pthread_mutex_lock(&defrag->dfq_mutex); + { + /*Throttle down: + If the reconfigured count is less than current thread + count, then the current thread will sleep */ + + /*TODO: Need to refactor the following block to work + *under defrag->lock. For now access + * defrag->current_thread_count and rthcount under + * dfq_mutex lock */ + while (!defrag->crawl_done && (defrag->recon_thread_count < + defrag->current_thread_count)) { + defrag->current_thread_count--; + gf_msg_debug("DHT", 0, + "Thread sleeping. " + "current thread count: %d", + defrag->current_thread_count); + + pthread_cond_wait(&defrag->df_wakeup_thread, + &defrag->dfq_mutex); + + defrag->current_thread_count++; + gf_msg_debug("DHT", 0, + "Thread wokeup. 
" + "current thread count: %d", + defrag->current_thread_count); + } + + if (defrag->q_entry_count) { + iterator = list_entry(q_head->next, typeof(*iterator), list); + + gf_msg_debug("DHT", 0, + "picking entry " + "%s", + iterator->df_entry->d_name); + + list_del_init(&(iterator->list)); + + defrag->q_entry_count--; + + if ((defrag->q_entry_count < MIN_MIGRATE_QUEUE_COUNT) && + defrag->wakeup_crawler) { + pthread_cond_broadcast(&defrag->rebalance_crawler_alarm); + } + pthread_mutex_unlock(&defrag->dfq_mutex); + ret = gf_defrag_migrate_single_file((void *)iterator); + + /*Critical errors: ENOTCONN and ENOSPACE*/ + if (ret) { + dht_set_global_defrag_error(defrag, ret); + + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + + pthread_cond_broadcast(&defrag->rebalance_crawler_alarm); + + pthread_cond_broadcast(&defrag->parallel_migration_cond); + + goto out; + } + + gf_defrag_free_container(iterator); + + continue; + } else { + /* defrag->crawl_done flag is set means crawling + file system is done and hence a list_empty when + the above flag is set indicates there are no more + entries to be added to the queue and rebalance is + finished */ + + if (!defrag->crawl_done) { + defrag->current_thread_count--; + gf_msg_debug("DHT", 0, + "Thread " + "sleeping while waiting " + "for migration entries. " + "current thread count:%d", + defrag->current_thread_count); + + pthread_cond_wait(&defrag->parallel_migration_cond, + &defrag->dfq_mutex); + } + + if (defrag->crawl_done && !defrag->q_entry_count) { + defrag->current_thread_count++; + gf_msg_debug("DHT", 0, "Exiting thread"); + + pthread_cond_broadcast(&defrag->parallel_migration_cond); + goto unlock; + } else { + defrag->current_thread_count++; + gf_msg_debug("DHT", 0, + "Thread woke up" + " as found migrating entries. 
" + "current thread count:%d", + defrag->current_thread_count); + + pthread_mutex_unlock(&defrag->dfq_mutex); + continue; + } + } + } + unlock: + pthread_mutex_unlock(&defrag->dfq_mutex); + break; + } +out: + return NULL; +} + +int static gf_defrag_get_entry(xlator_t *this, int i, + struct dht_container **container, loc_t *loc, + dht_conf_t *conf, gf_defrag_info_t *defrag, + fd_t *fd, dict_t *migrate_data, + struct dir_dfmeta *dir_dfmeta, dict_t *xattr_req, + int *perrno) +{ + int ret = 0; + char is_linkfile = 0; + gf_dirent_t *df_entry = NULL; + struct dht_container *tmp_container = NULL; + + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = -1; + goto out; + } + + if (dir_dfmeta->offset_var[i].readdir_done == 1) { + ret = 0; + goto out; + } + + if (dir_dfmeta->fetch_entries[i] == 1) { + if (!fd) { + dir_dfmeta->fetch_entries[i] = 0; + dir_dfmeta->offset_var[i].readdir_done = 1; + ret = 0; + goto out; + } + + ret = syncop_readdirp(conf->local_subvols[i], fd, 131072, + dir_dfmeta->offset_var[i].offset, + &(dir_dfmeta->equeue[i]), xattr_req, NULL); + if (ret == 0) { + dir_dfmeta->offset_var[i].readdir_done = 1; + ret = 0; + goto out; + } + + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, + DHT_MSG_MIGRATE_DATA_FAILED, + "Readdirp failed. 
Aborting data migration for " + "directory: %s", + loc->path); + *perrno = -ret; + ret = -1; + goto out; + } + + if (list_empty(&(dir_dfmeta->equeue[i].list))) { + dir_dfmeta->offset_var[i].readdir_done = 1; + ret = 0; + goto out; + } + + dir_dfmeta->fetch_entries[i] = 0; + } + + while (1) { + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = -1; + goto out; + } + + df_entry = list_entry(dir_dfmeta->iterator[i]->next, typeof(*df_entry), + list); + + if (&df_entry->list == dir_dfmeta->head[i]) { + gf_dirent_free(&(dir_dfmeta->equeue[i])); + INIT_LIST_HEAD(&(dir_dfmeta->equeue[i].list)); + dir_dfmeta->fetch_entries[i] = 1; + dir_dfmeta->iterator[i] = dir_dfmeta->head[i]; + ret = 0; + goto out; + } + + dir_dfmeta->iterator[i] = dir_dfmeta->iterator[i]->next; + + dir_dfmeta->offset_var[i].offset = df_entry->d_off; + if (!strcmp(df_entry->d_name, ".") || !strcmp(df_entry->d_name, "..")) + continue; + + if (IA_ISDIR(df_entry->d_stat.ia_type)) { + defrag->size_processed += df_entry->d_stat.ia_size; + continue; + } + + defrag->num_files_lookedup++; + + if (defrag->defrag_pattern && + (gf_defrag_pattern_match(defrag, df_entry->d_name, + df_entry->d_stat.ia_size) == _gf_false)) { + defrag->size_processed += df_entry->d_stat.ia_size; + continue; + } + + is_linkfile = check_is_linkfile(NULL, &df_entry->d_stat, df_entry->dict, + conf->link_xattr_name); + + if (is_linkfile) { + /* No need to add linkto file to the queue for + migration. Only the actual data file need to + be checked for migration criteria. 
+ */ + + gf_msg_debug(this->name, 0, + "Skipping linkfile" + " %s on subvol: %s", + df_entry->d_name, conf->local_subvols[i]->name); + continue; + } + + /*Build Container Structure */ + + tmp_container = GF_CALLOC(1, sizeof(struct dht_container), + gf_dht_mt_container_t); + if (!tmp_container) { + gf_log(this->name, GF_LOG_ERROR, + "Failed to allocate " + "memory for container"); + ret = -1; + goto out; + } + tmp_container->df_entry = gf_dirent_for_name(df_entry->d_name); + if (!tmp_container->df_entry) { + gf_log(this->name, GF_LOG_ERROR, + "Failed to allocate " + "memory for df_entry"); + ret = -1; + goto out; + } + + tmp_container->local_subvol_index = i; + + tmp_container->df_entry->d_stat = df_entry->d_stat; + + tmp_container->df_entry->d_ino = df_entry->d_ino; + + tmp_container->df_entry->d_type = df_entry->d_type; + + tmp_container->df_entry->d_len = df_entry->d_len; + + tmp_container->parent_loc = GF_CALLOC(1, sizeof(*loc), gf_dht_mt_loc_t); + if (!tmp_container->parent_loc) { + gf_log(this->name, GF_LOG_ERROR, + "Failed to allocate " + "memory for loc"); + ret = -1; + goto out; + } + + ret = loc_copy(tmp_container->parent_loc, loc); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "loc_copy failed"); + ret = -1; + goto out; + } + + tmp_container->migrate_data = migrate_data; + + tmp_container->this = this; + + if (df_entry->dict) + tmp_container->df_entry->dict = dict_ref(df_entry->dict); + + /*Build Container Structure >> END*/ + + ret = 0; + goto out; + } + +out: + if (ret == 0) { + *container = tmp_container; + } else { + if (tmp_container) { + gf_defrag_free_container(tmp_container); + } + } + + return ret; +} + +int +gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, + dict_t *migrate_data, int *perrno) +{ + int ret = -1; + dht_conf_t *conf = NULL; + gf_dirent_t entries; + dict_t *xattr_req = NULL; + struct timeval dir_start = { + 0, + }; + struct timeval end = { + 0, + }; + double elapsed = { + 0, + }; + int local_subvols_cnt 
= 0; + int i = 0; + int j = 0; + struct dht_container *container = NULL; + int ldfq_count = 0; + int dfc_index = 0; + int throttle_up = 0; + struct dir_dfmeta *dir_dfmeta = NULL; + xlator_t *old_THIS = NULL; + + gf_log(this->name, GF_LOG_INFO, "migrate data called on %s", loc->path); + gettimeofday(&dir_start, NULL); + + conf = this->private; + local_subvols_cnt = conf->local_subvols_cnt; + + if (!local_subvols_cnt) { + ret = 0; + goto out; + } + + old_THIS = THIS; + THIS = this; + + dir_dfmeta = GF_CALLOC(1, sizeof(*dir_dfmeta), gf_common_mt_pointer); + if (!dir_dfmeta) { + gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta is NULL"); + ret = -1; + goto out; + } + + dir_dfmeta->lfd = GF_CALLOC(local_subvols_cnt, sizeof(fd_t *), + gf_common_mt_pointer); + if (!dir_dfmeta->lfd) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_INSUFF_MEMORY, + "for dir_dfmeta", NULL); + ret = -1; + *perrno = ENOMEM; + goto out; + } + + for (i = 0; i < local_subvols_cnt; i++) { + dir_dfmeta->lfd[i] = fd_create(loc->inode, defrag->pid); + if (!dir_dfmeta->lfd[i]) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_FD_CREATE_FAILED, + NULL); + *perrno = ENOMEM; + ret = -1; + goto out; + } + + ret = syncop_opendir(conf->local_subvols[i], loc, dir_dfmeta->lfd[i], + NULL, NULL); + if (ret) { + fd_unref(dir_dfmeta->lfd[i]); + dir_dfmeta->lfd[i] = NULL; + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FAILED_TO_OPEN, + "dir: %s", loc->path, "subvol: %s", + conf->local_subvols[i]->name, NULL); + + if (conf->decommission_in_progress) { + *perrno = -ret; + ret = -1; + goto out; + } + } else { + fd_bind(dir_dfmeta->lfd[i]); + } + } + + dir_dfmeta->head = GF_CALLOC(local_subvols_cnt, sizeof(*(dir_dfmeta->head)), + gf_common_mt_pointer); + if (!dir_dfmeta->head) { + gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->head is NULL"); + ret = -1; + goto out; + } + + dir_dfmeta->iterator = GF_CALLOC(local_subvols_cnt, + sizeof(*(dir_dfmeta->iterator)), + gf_common_mt_pointer); + if (!dir_dfmeta->iterator) 
{ + gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->iterator is NULL"); + ret = -1; + goto out; + } + + dir_dfmeta->equeue = GF_CALLOC(local_subvols_cnt, sizeof(entries), + gf_dht_mt_dirent_t); + if (!dir_dfmeta->equeue) { + gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->equeue is NULL"); + ret = -1; + goto out; + } + + dir_dfmeta->offset_var = GF_CALLOC( + local_subvols_cnt, sizeof(dht_dfoffset_ctx_t), gf_dht_mt_octx_t); + if (!dir_dfmeta->offset_var) { + gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->offset_var is NULL"); + ret = -1; + goto out; + } + + ret = gf_defrag_ctx_subvols_init(dir_dfmeta->offset_var, this); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "dht_dfoffset_ctx_t" + "initialization failed"); + ret = -1; + goto out; + } + + dir_dfmeta->fetch_entries = GF_CALLOC(local_subvols_cnt, sizeof(int), + gf_common_mt_int); + if (!dir_dfmeta->fetch_entries) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_INSUFF_MEMORY, + "for dir_dfmeta->fetch_entries", NULL); + ret = -1; + goto out; + } + + for (i = 0; i < local_subvols_cnt; i++) { + INIT_LIST_HEAD(&(dir_dfmeta->equeue[i].list)); + dir_dfmeta->head[i] = &(dir_dfmeta->equeue[i].list); + dir_dfmeta->iterator[i] = dir_dfmeta->head[i]; + dir_dfmeta->fetch_entries[i] = 1; + } + + xattr_req = dict_new(); + if (!xattr_req) { + gf_log(this->name, GF_LOG_ERROR, "dict_new failed"); + ret = -1; + goto out; + } + + ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "failed to set dict for " + "key: %s", + conf->link_xattr_name); + ret = -1; + goto out; + } + + /* + Job: Read entries from each local subvol and store the entries + in equeue array of linked list. Now pick one entry from the + equeue array in a round robin basis and add them to defrag Queue. 
+ */ + + while (!dht_dfreaddirp_done(dir_dfmeta->offset_var, local_subvols_cnt)) { + pthread_mutex_lock(&defrag->dfq_mutex); + { + /*Throttle up: If reconfigured count is higher than + current thread count, wake up the sleeping threads + TODO: Need to refactor this. Instead of making the + thread sleep and wake, we should terminate and spawn + threads on-demand*/ + + if (defrag->recon_thread_count > defrag->current_thread_count) { + throttle_up = (defrag->recon_thread_count - + defrag->current_thread_count); + for (j = 0; j < throttle_up; j++) { + pthread_cond_signal(&defrag->df_wakeup_thread); + } + } + + while (defrag->q_entry_count > MAX_MIGRATE_QUEUE_COUNT) { + defrag->wakeup_crawler = 1; + pthread_cond_wait(&defrag->rebalance_crawler_alarm, + &defrag->dfq_mutex); + } + + ldfq_count = defrag->q_entry_count; + + if (defrag->wakeup_crawler) { + defrag->wakeup_crawler = 0; + } + } + pthread_mutex_unlock(&defrag->dfq_mutex); + + while ( + ldfq_count <= MAX_MIGRATE_QUEUE_COUNT && + !dht_dfreaddirp_done(dir_dfmeta->offset_var, local_subvols_cnt)) { + ret = gf_defrag_get_entry(this, dfc_index, &container, loc, conf, + defrag, dir_dfmeta->lfd[dfc_index], + migrate_data, dir_dfmeta, xattr_req, + perrno); + + if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED) { + goto out; + } + + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "Found " + "error from gf_defrag_get_entry"); + + ret = -1; + goto out; + } + + /* Check if we got an entry, else we need to move the + index to the next subvol */ + if (!container) { + GF_CRAWL_INDEX_MOVE(dfc_index, local_subvols_cnt); + continue; + } + + /* Q this entry in the dfq */ + pthread_mutex_lock(&defrag->dfq_mutex); + { + list_add_tail(&container->list, &(defrag->queue[0].list)); + defrag->q_entry_count++; + ldfq_count = defrag->q_entry_count; + + gf_msg_debug(this->name, 0, + "added " + "file:%s parent:%s to the queue ", + container->df_entry->d_name, + container->parent_loc->path); + + 
pthread_cond_signal(&defrag->parallel_migration_cond); + } + pthread_mutex_unlock(&defrag->dfq_mutex); + + GF_CRAWL_INDEX_MOVE(dfc_index, local_subvols_cnt); + } + } + + gettimeofday(&end, NULL); + elapsed = gf_tvdiff(&dir_start, &end); + gf_log(this->name, GF_LOG_INFO, + "Migration operation on dir %s took " + "%.2f secs", + loc->path, elapsed / 1e6); + ret = 0; +out: + THIS = old_THIS; + gf_defrag_free_dir_dfmeta(dir_dfmeta, local_subvols_cnt); + + if (xattr_req) + dict_unref(xattr_req); + + /* It does not matter if it errored out - this number is + * used to calculate rebalance estimated time to complete. + * No locking required as dirs are processed by a single thread. + */ + defrag->num_dirs_processed++; + return ret; +} + +int +gf_defrag_settle_hash(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, + dict_t *fix_layout) +{ + int ret; + dht_conf_t *conf = NULL; + /* + * Now we're ready to update the directory commit hash for the volume + * root, so that hash miscompares and broadcast lookups can stop. + * However, we want to skip that if fix-layout is all we did. In + * that case, we want the miscompares etc. to continue until a real + * rebalance is complete. 
+ */ + if (defrag->cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX || + defrag->cmd == GF_DEFRAG_CMD_DETACH_START) { + return 0; + } + + conf = this->private; + if (!conf) { + /*Uh oh + */ + return -1; + } + + if (conf->local_subvols_cnt == 0 || !conf->lookup_optimize) { + /* Commit hash updates are only done on local subvolumes and + * only when lookup optimization is needed (for older client + * support) + */ + return 0; + } + + ret = dict_set_uint32(fix_layout, "new-commit-hash", + defrag->new_commit_hash); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "Failed to set new-commit-hash"); + return -1; + } + + ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED, + "fix layout on %s failed", loc->path); + + if (-ret == ENOENT || -ret == ESTALE) { + /* Dir most likely is deleted */ + return 0; + } + + return -1; + } + + /* TBD: find more efficient solution than adding/deleting every time */ + dict_del(fix_layout, "new-commit-hash"); + + return 0; +} + +int +gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, + dict_t *fix_layout, dict_t *migrate_data) +{ + int ret = -1; + loc_t entry_loc = { + 0, + }; + fd_t *fd = NULL; + gf_dirent_t entries; + gf_dirent_t *tmp = NULL; + gf_dirent_t *entry = NULL; + gf_boolean_t free_entries = _gf_false; + off_t offset = 0; + struct iatt iatt = { + 0, + }; + inode_t *linked_inode = NULL, *inode = NULL; + dht_conf_t *conf = NULL; + int perrno = 0; + + conf = this->private; + if (!conf) { + ret = -1; + goto out; + } + + ret = syncop_lookup(this, loc, &iatt, NULL, NULL, NULL); + if (ret) { + if (strcmp(loc->path, "/") == 0) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_DIR_LOOKUP_FAILED, + "lookup failed for:%s", loc->path); + + defrag->total_failures++; + ret = -1; + goto out; + } + + if (-ret == ENOENT || -ret == ESTALE) { + gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_LOOKUP_FAILED, + "Dir:%s renamed or removed. 
Skipping", loc->path); + if (conf->decommission_subvols_cnt) { + defrag->total_failures++; + } + ret = 0; + goto out; + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_DIR_LOOKUP_FAILED, + "lookup failed for:%s", loc->path); + + defrag->total_failures++; + goto out; + } + } + + fd = fd_create(loc->inode, defrag->pid); + if (!fd) { + gf_log(this->name, GF_LOG_ERROR, "Failed to create fd"); + ret = -1; + goto out; + } + + ret = syncop_opendir(this, loc, fd, NULL, NULL); + if (ret) { + if (-ret == ENOENT || -ret == ESTALE) { + if (conf->decommission_subvols_cnt) { + defrag->total_failures++; + } + ret = 0; + goto out; + } + + gf_log(this->name, GF_LOG_ERROR, + "Failed to open dir %s, " + "err:%d", + loc->path, -ret); + + ret = -1; + goto out; + } + + fd_bind(fd); + INIT_LIST_HEAD(&entries.list); + + while ((ret = syncop_readdirp(this, fd, 131072, offset, &entries, NULL, + NULL)) != 0) { + if (ret < 0) { + if (-ret == ENOENT || -ret == ESTALE) { + if (conf->decommission_subvols_cnt) { + defrag->total_failures++; + } + ret = 0; + goto out; + } + + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_READDIR_ERROR, + "readdirp failed for " + "path %s. 
Aborting fix-layout", + loc->path); + + ret = -1; + goto out; + } + + if (list_empty(&entries.list)) + break; + + free_entries = _gf_true; + + list_for_each_entry_safe(entry, tmp, &entries.list, list) + { + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = 1; + goto out; + } + + offset = entry->d_off; + + if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) + continue; + if (!IA_ISDIR(entry->d_stat.ia_type)) { + continue; + } + loc_wipe(&entry_loc); + + ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "Child loc" + " build failed for entry: %s", + entry->d_name); + + if (conf->decommission_in_progress) { + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + + goto out; + } else { + continue; + } + } + + if (gf_uuid_is_null(entry->d_stat.ia_gfid)) { + gf_log(this->name, GF_LOG_ERROR, + "%s/%s" + " gfid not present", + loc->path, entry->d_name); + continue; + } + + gf_uuid_copy(entry_loc.gfid, entry->d_stat.ia_gfid); + + /*In case the gfid stored in the inode by inode_link + * and the gfid obtained in the lookup differs, then + * client3_3_lookup_cbk will return ESTALE and proper + * error will be captured + */ + + linked_inode = inode_link(entry_loc.inode, loc->inode, + entry->d_name, &entry->d_stat); + + inode = entry_loc.inode; + entry_loc.inode = linked_inode; + inode_unref(inode); + + if (gf_uuid_is_null(loc->gfid)) { + gf_log(this->name, GF_LOG_ERROR, + "%s/%s" + " gfid not present", + loc->path, entry->d_name); + continue; + } + + gf_uuid_copy(entry_loc.pargfid, loc->gfid); + + ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL); + if (ret) { + if (-ret == ENOENT || -ret == ESTALE) { + gf_msg(this->name, GF_LOG_INFO, -ret, + DHT_MSG_DIR_LOOKUP_FAILED, + "Dir:%s renamed or removed. 
" + "Skipping", + loc->path); + ret = 0; + if (conf->decommission_subvols_cnt) { + defrag->total_failures++; + } + continue; + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_DIR_LOOKUP_FAILED, "lookup failed for:%s", + entry_loc.path); + + defrag->total_failures++; + + if (conf->decommission_in_progress) { + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + ret = -1; + goto out; + } else { + continue; + } + } + } + + /* A return value of 2 means, either process_dir or + * lookup of a dir failed. Hence, don't commit hash + * for the current directory*/ + + ret = gf_defrag_fix_layout(this, defrag, &entry_loc, fix_layout, + migrate_data); + + if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED || + defrag->defrag_status == GF_DEFRAG_STATUS_FAILED) { + goto out; + } + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_FIX_FAILED, + "Fix layout failed for %s", entry_loc.path); + + defrag->total_failures++; + + if (conf->decommission_in_progress) { + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + + goto out; + } else { + /* Let's not commit-hash if + * gf_defrag_fix_layout failed*/ + continue; + } + } + } + + gf_dirent_free(&entries); + free_entries = _gf_false; + INIT_LIST_HEAD(&entries.list); + } + + /* A directory layout is fixed only after its subdirs are healed to + * any newly added bricks. If the layout is fixed before subdirs are + * healed, the newly added brick will get a non-null layout. + * Any subdirs which hash to that layout will no longer show up + * in a directory listing until they are healed. + */ + + ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL); + + /* In case of a race where the directory is deleted just before + * layout setxattr, the errors are updated in the layout structure. + * We can use this information to make a decision whether the directory + * is deleted entirely. 
+ */ + if (ret == 0) { + ret = dht_dir_layout_error_check(this, loc->inode); + ret = -ret; + } + + if (ret) { + if (-ret == ENOENT || -ret == ESTALE) { + gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_LAYOUT_FIX_FAILED, + "Setxattr failed. Dir %s " + "renamed or removed", + loc->path); + if (conf->decommission_subvols_cnt) { + defrag->total_failures++; + } + ret = 0; + goto out; + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED, + "Setxattr failed for %s", loc->path); + + defrag->total_failures++; + + if (conf->decommission_in_progress) { + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + ret = -1; + goto out; + } + } + } + + if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { + ret = gf_defrag_process_dir(this, defrag, loc, migrate_data, &perrno); + + if (ret) { + if (perrno == ENOENT || perrno == ESTALE) { + ret = 0; + goto out; + } else { + defrag->total_failures++; + + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_DEFRAG_PROCESS_DIR_FAILED, + "gf_defrag_process_dir failed for " + "directory: %s", + loc->path); + + if (conf->decommission_in_progress) { + goto out; + } + } + } + } + + gf_msg_trace(this->name, 0, "fix layout called on %s", loc->path); + + if (gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) { + defrag->total_failures++; + + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SETTLE_HASH_FAILED, + "Settle hash failed for %s", loc->path); + + ret = -1; + + if (conf->decommission_in_progress) { + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + goto out; + } + } + + ret = 0; +out: + if (free_entries) + gf_dirent_free(&entries); + + loc_wipe(&entry_loc); + + if (fd) + fd_unref(fd); + + return ret; +} + +int +dht_init_local_subvols_and_nodeuuids(xlator_t *this, dht_conf_t *conf, + loc_t *loc) +{ + dict_t *dict = NULL; + uuid_t *uuid_ptr = NULL; + int ret = -1; + int i = 0; + int j = 0; + + /* Find local subvolumes */ + ret = syncop_getxattr(this, loc, &dict, GF_REBAL_FIND_LOCAL_SUBVOL, NULL, + NULL); + if (ret && (ret 
!= -ENODATA)) { + gf_msg(this->name, GF_LOG_ERROR, -ret, 0, + "local " + "subvolume determination failed with error: %d", + -ret); + ret = -1; + goto out; + } + + if (!ret) + goto out; + + ret = syncop_getxattr(this, loc, &dict, GF_REBAL_OLD_FIND_LOCAL_SUBVOL, + NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, 0, + "local " + "subvolume determination failed with error: %d", + -ret); + ret = -1; + goto out; + } + ret = 0; + +out: + if (ret) { + return ret; + } + + for (i = 0; i < conf->local_subvols_cnt; i++) { + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "local subvol: " + "%s", + conf->local_subvols[i]->name); + + for (j = 0; j < conf->local_nodeuuids[i].count; j++) { + uuid_ptr = &(conf->local_nodeuuids[i].elements[j].uuid); + gf_msg(this->name, GF_LOG_INFO, 0, 0, "node uuid : %s", + uuid_utoa(*uuid_ptr)); + } + } + + return ret; +} + +/* Functions for the rebalance estimates feature */ + +uint64_t +gf_defrag_subvol_file_size(xlator_t *this, loc_t *root_loc) +{ + int ret = -1; + struct statvfs buf = { + 0, + }; + + ret = syncop_statfs(this, root_loc, &buf, NULL, NULL); + if (ret) { + /* Aargh! 
*/ + return 0; + } + return ((buf.f_blocks - buf.f_bfree) * buf.f_frsize); +} + +uint64_t +gf_defrag_total_file_size(xlator_t *this, loc_t *root_loc) +{ + dht_conf_t *conf = NULL; + int i = 0; + uint64_t size_files = 0; + uint64_t total_size = 0; + + conf = this->private; + if (!conf) { + return 0; + } + + for (i = 0; i < conf->local_subvols_cnt; i++) { + size_files = gf_defrag_subvol_file_size(conf->local_subvols[i], + root_loc); + total_size += size_files; + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "local subvol: %s," + "cnt = %" PRIu64, + conf->local_subvols[i]->name, size_files); + } + + gf_msg(this->name, GF_LOG_INFO, 0, 0, "Total size files = %" PRIu64, + total_size); + + return total_size; +} + +static void * +dht_file_counter_thread(void *args) +{ + gf_defrag_info_t *defrag = NULL; + loc_t root_loc = { + 0, + }; + struct timespec time_to_wait = { + 0, + }; + uint64_t tmp_size = 0; + + if (!args) + return NULL; + + defrag = (gf_defrag_info_t *)args; + dht_build_root_loc(defrag->root_inode, &root_loc); + + while (defrag->defrag_status == GF_DEFRAG_STATUS_STARTED) { + timespec_now(&time_to_wait); + time_to_wait.tv_sec += 600; + + pthread_mutex_lock(&defrag->fc_mutex); + pthread_cond_timedwait(&defrag->fc_wakeup_cond, &defrag->fc_mutex, + &time_to_wait); + + pthread_mutex_unlock(&defrag->fc_mutex); + + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) + break; + + tmp_size = gf_defrag_total_file_size(defrag->this, &root_loc); + + gf_log("dht", GF_LOG_INFO, "tmp data size =%" PRIu64, tmp_size); + + if (!tmp_size) { + gf_msg("dht", GF_LOG_ERROR, 0, 0, + "Failed to get " + "the total data size. Unable to estimate " + "time to complete rebalance."); + } else { + g_totalsize = tmp_size; + gf_msg_debug("dht", 0, "total data size =%" PRIu64, g_totalsize); + } + } + + return NULL; +} + +int +gf_defrag_estimates_cleanup(xlator_t *this, gf_defrag_info_t *defrag, + pthread_t filecnt_thread) +{ + int ret = -1; + + /* Wake up the filecounter thread. 
+ * By now the defrag status will no longer be + * GF_DEFRAG_STATUS_STARTED so the thread will exit the loop. + */ + pthread_mutex_lock(&defrag->fc_mutex); + { + pthread_cond_broadcast(&defrag->fc_wakeup_cond); + } + pthread_mutex_unlock(&defrag->fc_mutex); + + ret = pthread_join(filecnt_thread, NULL); + if (ret) { + gf_msg("dht", GF_LOG_ERROR, ret, 0, + "file_counter_thread: pthread_join failed."); + ret = -1; + } + return ret; +} + +int +gf_defrag_estimates_init(xlator_t *this, loc_t *loc, pthread_t *filecnt_thread) +{ + int ret = -1; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + + conf = this->private; + defrag = conf->defrag; + + g_totalsize = gf_defrag_total_file_size(this, loc); + if (!g_totalsize) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "Failed to get " + "the total data size. Unable to estimate " + "time to complete rebalance."); + goto out; + } + + ret = gf_thread_create(filecnt_thread, NULL, dht_file_counter_thread, + (void *)defrag, "dhtfcnt"); + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ret, 0, + "Failed to " + "create the file counter thread "); + ret = -1; + goto out; + } + ret = 0; +out: + return ret; +} + +/* Init and cleanup functions for parallel file migration*/ +int +gf_defrag_parallel_migration_init(xlator_t *this, gf_defrag_info_t *defrag, + pthread_t **tid_array, int *thread_index) +{ + int ret = -1; + int thread_spawn_count = 0; + int index = 0; + pthread_t *tid = NULL; + + if (!defrag) + goto out; + + /* Initialize global entry queue */ + defrag->queue = GF_CALLOC(1, sizeof(struct dht_container), + gf_dht_mt_container_t); + + if (!defrag->queue) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, + "Failed to initialise migration queue"); + ret = -1; + goto out; + } + + INIT_LIST_HEAD(&(defrag->queue[0].list)); + + thread_spawn_count = MAX(MAX_REBAL_THREADS, 4); + + gf_msg_debug(this->name, 0, "thread_spawn_count: %d", thread_spawn_count); + + tid = GF_CALLOC(thread_spawn_count, sizeof(pthread_t), + 
gf_common_mt_pthread_t); + if (!tid) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, + "Failed to create migration threads"); + ret = -1; + goto out; + } + defrag->current_thread_count = thread_spawn_count; + + /*Spawn Threads Here*/ + while (index < thread_spawn_count) { + ret = gf_thread_create(&(tid[index]), NULL, gf_defrag_task, + (void *)defrag, "dhtmig%d", (index + 1) & 0x3ff); + if (ret != 0) { + gf_msg("DHT", GF_LOG_ERROR, ret, 0, "Thread[%d] creation failed. ", + index); + ret = -1; + goto out; + } else { + gf_log("DHT", GF_LOG_INFO, + "Thread[%d] " + "creation successful", + index); + } + index++; + } + + ret = 0; +out: + *thread_index = index; + *tid_array = tid; + + return ret; +} + +int +gf_defrag_parallel_migration_cleanup(gf_defrag_info_t *defrag, + pthread_t *tid_array, int thread_index) +{ + int ret = -1; + int i = 0; + + if (!defrag) + goto out; + + /* Wake up all migration threads */ + pthread_mutex_lock(&defrag->dfq_mutex); + { + defrag->crawl_done = 1; + + pthread_cond_broadcast(&defrag->parallel_migration_cond); + pthread_cond_broadcast(&defrag->df_wakeup_thread); + } + pthread_mutex_unlock(&defrag->dfq_mutex); + + /*Wait for all the threads to complete their task*/ + for (i = 0; i < thread_index; i++) { + pthread_join(tid_array[i], NULL); + } + + GF_FREE(tid_array); + + /* Cleanup the migration queue */ + if (defrag->queue) { + gf_dirent_free(defrag->queue[0].df_entry); + INIT_LIST_HEAD(&(defrag->queue[0].list)); + } + + GF_FREE(defrag->queue); + + ret = 0; +out: + return ret; +} + +int +gf_defrag_start_crawl(void *data) +{ + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + dict_t *fix_layout = NULL; + dict_t *migrate_data = NULL; + dict_t *status = NULL; + glusterfs_ctx_t *ctx = NULL; + call_frame_t *statfs_frame = NULL; + xlator_t *old_THIS = NULL; + int ret = -1; + loc_t loc = { + 0, + }; + struct iatt iatt = { + 0, + }; + struct iatt parent = { + 0, + }; + int thread_index = 0; + pthread_t *tid = 
NULL; + pthread_t filecnt_thread; + gf_boolean_t fc_thread_started = _gf_false; + + this = data; + if (!this) + goto exit; + + ctx = this->ctx; + if (!ctx) + goto exit; + + conf = this->private; + if (!conf) + goto exit; + + defrag = conf->defrag; + if (!defrag) + goto exit; + + defrag->start_time = gf_time(); + + dht_build_root_inode(this, &defrag->root_inode); + if (!defrag->root_inode) + goto out; + + dht_build_root_loc(defrag->root_inode, &loc); + + /* fix-layout on '/' first */ + + ret = syncop_lookup(this, &loc, &iatt, &parent, NULL, NULL); + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_REBALANCE_START_FAILED, + "Failed to start rebalance: look up on / failed"); + ret = -1; + goto out; + } + + old_THIS = THIS; + THIS = this; + + statfs_frame = create_frame(this, this->ctx->pool); + if (!statfs_frame) { + gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM, + "Insufficient memory. Frame creation failed"); + ret = -1; + goto out; + } + + /* async statfs update for honoring min-free-disk */ + dht_get_du_info(statfs_frame, this, &loc); + THIS = old_THIS; + + fix_layout = dict_new(); + if (!fix_layout) { + ret = -1; + goto out; + } + + /* + * Unfortunately, we can't do special xattrs (like fix.layout) and + * real ones in the same call currently, and changing it seems + * riskier than just doing two calls. + */ + + gf_log(this->name, GF_LOG_INFO, "%s using commit hash %u", __func__, + conf->vol_commit_hash); + + ret = dict_set_uint32(fix_layout, conf->commithash_xattr_name, + conf->vol_commit_hash); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "Failed to set %s", + conf->commithash_xattr_name); + defrag->total_failures++; + ret = -1; + goto out; + } + + ret = syncop_setxattr(this, &loc, fix_layout, 0, NULL, NULL); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "Failed to set commit hash on %s. 
" + "Rebalance cannot proceed.", + loc.path); + defrag->total_failures++; + ret = -1; + goto out; + } + + /* We now return to our regularly scheduled program. */ + + ret = dict_set_str(fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes"); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_START_FAILED, + "Failed to start rebalance:" + "Failed to set dictionary value: key = %s", + GF_XATTR_FIX_LAYOUT_KEY); + defrag->total_failures++; + ret = -1; + goto out; + } + + defrag->new_commit_hash = conf->vol_commit_hash; + + ret = syncop_setxattr(this, &loc, fix_layout, 0, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_REBALANCE_FAILED, + "fix layout on %s failed", loc.path); + defrag->total_failures++; + ret = -1; + goto out; + } + + if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { + /* We need to migrate files */ + + migrate_data = dict_new(); + if (!migrate_data) { + defrag->total_failures++; + ret = -1; + goto out; + } + ret = dict_set_str( + migrate_data, GF_XATTR_FILE_MIGRATE_KEY, + (defrag->cmd == GF_DEFRAG_CMD_START_FORCE) ? "force" : "non-force"); + if (ret) { + defrag->total_failures++; + ret = -1; + goto out; + } + + ret = dht_init_local_subvols_and_nodeuuids(this, conf, &loc); + if (ret) { + ret = -1; + goto out; + } + + /* Initialise the structures required for parallel migration */ + ret = gf_defrag_parallel_migration_init(this, defrag, &tid, + &thread_index); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "Aborting rebalance."); + goto out; + } + + ret = gf_defrag_estimates_init(this, &loc, &filecnt_thread); + if (ret) { + /* Not a fatal error. 
Allow the rebalance to proceed*/ + ret = 0; + } else { + fc_thread_started = _gf_true; + } + } + + ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout, migrate_data); + if (ret) { + defrag->total_failures++; + ret = -1; + goto out; + } + + if (gf_defrag_settle_hash(this, defrag, &loc, fix_layout) != 0) { + defrag->total_failures++; + ret = -1; + goto out; + } + + gf_log("DHT", GF_LOG_INFO, "crawling file-system completed"); +out: + + /* We are here means crawling the entire file system is done + or something failed. Set defrag->crawl_done flag to intimate + the migrator threads to exhaust the defrag->queue and terminate*/ + + if (ret) { + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + } + + gf_defrag_parallel_migration_cleanup(defrag, tid, thread_index); + + if ((defrag->defrag_status != GF_DEFRAG_STATUS_STOPPED) && + (defrag->defrag_status != GF_DEFRAG_STATUS_FAILED)) { + defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE; + } + + if (fc_thread_started) { + gf_defrag_estimates_cleanup(this, defrag, filecnt_thread); + } + + dht_send_rebalance_event(this, defrag->cmd, defrag->defrag_status); + + status = dict_new(); + LOCK(&defrag->lock); + { + gf_defrag_status_get(conf, status); + if (ctx && ctx->notify) + ctx->notify(GF_EN_DEFRAG_STATUS, status); + if (status) + dict_unref(status); + defrag->is_exiting = 1; + } + UNLOCK(&defrag->lock); + + GF_FREE(defrag); + conf->defrag = NULL; + + if (migrate_data) + dict_unref(migrate_data); + + if (statfs_frame) { + STACK_DESTROY(statfs_frame->root); + } +exit: + return ret; +} + +static int +gf_defrag_done(int ret, call_frame_t *sync_frame, void *data) +{ + gf_listener_stop(sync_frame->this); + + STACK_DESTROY(sync_frame->root); + kill(getpid(), SIGTERM); + return 0; +} + +void * +gf_defrag_start(void *data) +{ + int ret = -1; + call_frame_t *frame = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + xlator_t *this = NULL; + xlator_t *old_THIS = NULL; + + this = data; + conf = this->private; + if 
(!conf) + goto out; + + defrag = conf->defrag; + if (!defrag) + goto out; + + frame = create_frame(this, this->ctx->pool); + if (!frame) + goto out; + + frame->root->pid = GF_CLIENT_PID_DEFRAG; + + defrag->pid = frame->root->pid; + + defrag->defrag_status = GF_DEFRAG_STATUS_STARTED; + + old_THIS = THIS; + THIS = this; + ret = synctask_new(this->ctx->env, gf_defrag_start_crawl, gf_defrag_done, + frame, this); + + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_START_FAILED, + "Could not create task for rebalance"); + THIS = old_THIS; +out: + return NULL; +} + +uint64_t +gf_defrag_get_estimates_based_on_size(dht_conf_t *conf) +{ + gf_defrag_info_t *defrag = NULL; + double rate_processed = 0; + uint64_t total_processed = 0; + uint64_t tmp_count = 0; + uint64_t time_to_complete = 0; + double elapsed = 0; + + defrag = conf->defrag; + + if (!g_totalsize) + goto out; + + elapsed = gf_time() - defrag->start_time; + + /* Don't calculate the estimates for the first 10 minutes. + * It is unlikely to be accurate and estimates are not required + * if the process finishes in less than 10 mins. 
+ */ + + if (elapsed < ESTIMATE_START_INTERVAL) { + gf_msg(THIS->name, GF_LOG_INFO, 0, 0, + "Rebalance estimates will not be available for the " + "first %d seconds.", + ESTIMATE_START_INTERVAL); + + goto out; + } + + total_processed = defrag->size_processed; + + /* rate at which files processed */ + rate_processed = (total_processed) / elapsed; + + tmp_count = g_totalsize; + + if (rate_processed) { + time_to_complete = (tmp_count) / rate_processed; + + } else { + gf_msg(THIS->name, GF_LOG_ERROR, 0, 0, + "Unable to calculate estimated time for rebalance"); + } + + gf_log(THIS->name, GF_LOG_INFO, + "TIME: (size) total_processed=%" PRIu64 " tmp_cnt = %" PRIu64 + "," + "rate_processed=%f, elapsed = %f", + total_processed, tmp_count, rate_processed, elapsed); + +out: + return time_to_complete; +} + +int +gf_defrag_status_get(dht_conf_t *conf, dict_t *dict) +{ + int ret = 0; + uint64_t files = 0; + uint64_t size = 0; + uint64_t lookup = 0; + uint64_t failures = 0; + uint64_t skipped = 0; + char *status = ""; + double elapsed = 0; + uint64_t time_to_complete = 0; + uint64_t time_left = 0; + gf_defrag_info_t *defrag = conf->defrag; + + if (!defrag) + goto out; + + ret = 0; + if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) + goto out; + + files = defrag->total_files; + size = defrag->total_data; + lookup = defrag->num_files_lookedup; + failures = defrag->total_failures; + skipped = defrag->skipped; + + elapsed = gf_time() - defrag->start_time; + + /* The rebalance is still in progress */ + + if (defrag->defrag_status == GF_DEFRAG_STATUS_STARTED) { + time_to_complete = gf_defrag_get_estimates_based_on_size(conf); + + if (time_to_complete && (time_to_complete > elapsed)) + time_left = time_to_complete - elapsed; + + gf_log(THIS->name, GF_LOG_INFO, + "TIME: Estimated total time to complete (size)= %" PRIu64 + " seconds, seconds left = %" PRIu64 "", + time_to_complete, time_left); + } + + if (!dict) + goto log; + + ret = dict_set_uint64(dict, "files", files); + if 
(ret) + gf_log(THIS->name, GF_LOG_WARNING, "failed to set file count"); + + ret = dict_set_uint64(dict, "size", size); + if (ret) + gf_log(THIS->name, GF_LOG_WARNING, "failed to set size of xfer"); + + ret = dict_set_uint64(dict, "lookups", lookup); + if (ret) + gf_log(THIS->name, GF_LOG_WARNING, "failed to set lookedup file count"); + + ret = dict_set_int32(dict, "status", defrag->defrag_status); + if (ret) + gf_log(THIS->name, GF_LOG_WARNING, "failed to set status"); + + ret = dict_set_double(dict, "run-time", elapsed); + if (ret) + gf_log(THIS->name, GF_LOG_WARNING, "failed to set run-time"); + + ret = dict_set_uint64(dict, "failures", failures); + if (ret) + gf_log(THIS->name, GF_LOG_WARNING, "failed to set failure count"); + + ret = dict_set_uint64(dict, "skipped", skipped); + if (ret) + gf_log(THIS->name, GF_LOG_WARNING, "failed to set skipped file count"); + + ret = dict_set_uint64(dict, "time-left", time_left); + if (ret) + gf_log(THIS->name, GF_LOG_WARNING, "failed to set time-left"); + +log: + switch (defrag->defrag_status) { + case GF_DEFRAG_STATUS_NOT_STARTED: + status = "not started"; + break; + case GF_DEFRAG_STATUS_STARTED: + status = "in progress"; + break; + case GF_DEFRAG_STATUS_STOPPED: + status = "stopped"; + break; + case GF_DEFRAG_STATUS_COMPLETE: + status = "completed"; + break; + case GF_DEFRAG_STATUS_FAILED: + status = "failed"; + break; + default: + break; + } + + gf_msg(THIS->name, GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STATUS, + "Rebalance is %s. 
Time taken is %.2f secs", status, elapsed); + gf_msg(THIS->name, GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STATUS, + "Files migrated: %" PRIu64 ", size: %" PRIu64 ", lookups: %" PRIu64 + ", failures: %" PRIu64 + ", skipped: " + "%" PRIu64, + files, size, lookup, failures, skipped); +out: + return 0; +} + +int +gf_defrag_stop(dht_conf_t *conf, gf_defrag_status_t status, dict_t *output) +{ + /* TODO: set a variable 'stop_defrag' here, it should be checked + in defrag loop */ + int ret = -1; + gf_defrag_info_t *defrag = conf->defrag; + + GF_ASSERT(defrag); + + if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) { + goto out; + } + + gf_msg("", GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STOPPED, + "Received stop command on rebalance"); + defrag->defrag_status = status; + + if (output) + gf_defrag_status_get(conf, output); + ret = 0; +out: + gf_msg_debug("", 0, "Returning %d", ret); + return ret; +} diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c index f6ed8769d8d..d9dbf50492f 100644 --- a/xlators/cluster/dht/src/dht-rename.c +++ b/xlators/cluster/dht/src/dht-rename.c @@ -1,709 +1,1997 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ /* TODO: link(oldpath, newpath) fails if newpath already exists. DHT should * delete the newpath if it gets EEXISTS from link() call. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "xlator.h" #include "dht-common.h" -#include "defaults.h" +#include "dht-lock.h" +#include <glusterfs/defaults.h> +int +dht_rename_unlock(call_frame_t *frame, xlator_t *this); +int32_t +dht_rename_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); int -dht_rename_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *stbuf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) +dht_rename_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; - local = frame->local; - prev = cookie; + local = frame->local; - if (op_ret == -1) { - /* TODO: undo the damage */ + dht_set_fixed_dir_stat(&local->preoldparent); + dht_set_fixed_dir_stat(&local->postoldparent); + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); - gf_log (this->name, GF_LOG_INFO, - "rename %s -> %s on %s failed (%s)", - local->loc.path, local->loc2.path, - prev->this->name, strerror (op_errno)); + if (IA_ISREG(local->stbuf.ia_type)) + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); - local->op_ret = op_ret; - local->op_errno = op_errno; - goto unwind; - } - /* TODO: construct proper stbuf for dir */ - /* - * FIXME: is this the correct way to 
build stbuf and - * parent bufs? - */ - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->preoldparent, preoldparent, - prev->this); - dht_iatt_merge (this, &local->postoldparent, postoldparent, - prev->this); - dht_iatt_merge (this, &local->preparent, prenewparent, - prev->this); - dht_iatt_merge (this, &local->postparent, postnewparent, - prev->this); + DHT_STACK_UNWIND(rename, frame, local->op_ret, local->op_errno, + &local->stbuf, &local->preoldparent, &local->postoldparent, + &local->preparent, &local->postparent, local->xattr); + return 0; +} +static void +dht_rename_dir_unlock_src(call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; -unwind: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - local->stbuf.ia_ino = local->loc.inode->ino; + local = frame->local; + dht_unlock_namespace(frame, &local->lock[0]); + return; +} - local->preoldparent.ia_ino = local->loc.parent->ino; - local->postoldparent.ia_ino = local->loc.parent->ino; +static void +dht_rename_dir_unlock_dst(call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + int op_ret = -1; + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + + /* Unlock entrylk */ + dht_unlock_entrylk_wrapper(frame, &local->lock[1].ns.directory_ns); + + /* Unlock inodelk */ + op_ret = dht_unlock_inodelk(frame, local->lock[1].ns.parent_layout.locks, + local->lock[1].ns.parent_layout.lk_count, + dht_rename_unlock_cbk); + if (op_ret < 0) { + uuid_utoa_r(local->loc.inode->gfid, src_gfid); + + if (local->loc2.inode) + uuid_utoa_r(local->loc2.inode->gfid, dst_gfid); + + if (IA_ISREG(local->stbuf.ia_type)) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED, + "winding unlock inodelk failed " + "rename (%s:%s:%s %s:%s:%s), " + "stale locks left on bricks", + local->loc.path, src_gfid, local->src_cached->name, + local->loc2.path, dst_gfid, + local->dst_cached ? 
local->dst_cached->name : NULL); + else + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED, + "winding unlock inodelk failed " + "rename (%s:%s %s:%s), " + "stale locks left on bricks", + local->loc.path, src_gfid, local->loc2.path, dst_gfid); - local->preparent.ia_ino = local->loc2.parent->ino; - local->postparent.ia_ino = local->loc2.parent->ino; + dht_rename_unlock_cbk(frame, NULL, this, 0, 0, NULL); + } - WIPE (&local->preoldparent); - WIPE (&local->postoldparent); - WIPE (&local->preparent); - WIPE (&local->postparent); + return; +} - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preoldparent, - &local->postoldparent, - &local->preparent, &local->postparent); +static int +dht_rename_dir_unlock(call_frame_t *frame, xlator_t *this) +{ + dht_rename_dir_unlock_src(frame, this); + dht_rename_dir_unlock_dst(frame, this); + return 0; +} +int +dht_rename_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *stbuf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + int i = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + int subvol_cnt = -1; + + conf = this->private; + local = frame->local; + prev = cookie; + subvol_cnt = dht_subvol_cnt(this, prev); + local->ret_cache[subvol_cnt] = op_ret; + + if (op_ret == -1) { + gf_uuid_unparse(local->loc.inode->gfid, gfid); + + gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_RENAME_FAILED, + "Rename %s -> %s on %s failed, (gfid = %s)", local->loc.path, + local->loc2.path, prev->name, gfid); + + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + /* TODO: construct proper stbuf for dir */ + /* + * FIXME: is this the correct way to build stbuf and + * parent bufs? 
+ */ + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->preoldparent, preoldparent); + dht_iatt_merge(this, &local->postoldparent, postoldparent); + dht_iatt_merge(this, &local->preparent, prenewparent); + dht_iatt_merge(this, &local->postparent, postnewparent); + +unwind: + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + /* We get here with local->call_cnt == 0. Which means + * we are the only one executing this code, there is + * no contention. Therefore it's safe to manipulate or + * deref local->call_cnt directly (without locking). + */ + if (local->ret_cache[conf->subvolume_cnt] == 0) { + /* count errant subvols in last field of ret_cache */ + for (i = 0; i < conf->subvolume_cnt; i++) { + if (local->ret_cache[i] != 0) + ++local->ret_cache[conf->subvolume_cnt]; + } + if (local->ret_cache[conf->subvolume_cnt]) { + /* undoing the damage: + * for all subvolumes, where rename + * succeeded, we perform the reverse operation + */ + for (i = 0; i < conf->subvolume_cnt; i++) { + if (local->ret_cache[i] == 0) + ++local->call_cnt; + } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (local->ret_cache[i]) + continue; + + STACK_WIND(frame, dht_rename_dir_cbk, conf->subvolumes[i], + conf->subvolumes[i]->fops->rename, &local->loc2, + &local->loc, NULL); + } + + return 0; + } } - return 0; -} + WIPE(&local->preoldparent); + WIPE(&local->postoldparent); + WIPE(&local->preparent); + WIPE(&local->postparent); + dht_rename_dir_unlock(frame, this); + } + return 0; +} int -dht_rename_dir_do (call_frame_t *frame, xlator_t *this) +dht_rename_hashed_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *stbuf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int i = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int 
call_cnt = 0; + xlator_t *prev = NULL; + int i = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + conf = this->private; + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + gf_uuid_unparse(local->loc.inode->gfid, gfid); + + gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_RENAME_FAILED, + "rename %s -> %s on %s failed, (gfid = %s) ", local->loc.path, + local->loc2.path, prev->name, gfid); + + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + /* TODO: construct proper stbuf for dir */ + /* + * FIXME: is this the correct way to build stbuf and + * parent bufs? + */ + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->preoldparent, preoldparent); + dht_iatt_merge(this, &local->postoldparent, postoldparent); + dht_iatt_merge(this, &local->preparent, prenewparent); + dht_iatt_merge(this, &local->postparent, postnewparent); + + call_cnt = local->call_cnt = conf->subvolume_cnt - 1; + + if (!local->call_cnt) + goto unwind; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == local->dst_hashed) + continue; + STACK_WIND_COOKIE( + frame, dht_rename_dir_cbk, conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->rename, &local->loc, &local->loc2, NULL); + if (!--call_cnt) + break; + } + + return 0; +unwind: + WIPE(&local->preoldparent); + WIPE(&local->postoldparent); + WIPE(&local->preparent); + WIPE(&local->postparent); - conf = this->private; - local = frame->local; + dht_rename_dir_unlock(frame, this); + return 0; +} - if (local->op_ret == -1) - goto err; +int +dht_rename_dir_do(call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; - local->call_cnt = conf->subvolume_cnt; - local->op_ret = 0; + local = frame->local; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_rename_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->rename, - &local->loc, &local->loc2); - } + if (local->op_ret == -1) + goto err; - return 0; + local->op_ret = 
0; + + STACK_WIND_COOKIE(frame, dht_rename_hashed_dir_cbk, local->dst_hashed, + local->dst_hashed, local->dst_hashed->fops->rename, + &local->loc, &local->loc2, NULL); + return 0; err: - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, NULL, NULL, - NULL, NULL, NULL); - return 0; + dht_rename_dir_unlock(frame, this); + return 0; } - int -dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries) -{ - dht_local_t *local = NULL; - int this_call_cnt = -1; - call_frame_t *prev = NULL; - - local = frame->local; - prev = cookie; - - if (op_ret > 2) { - gf_log (this->name, GF_LOG_TRACE, - "readdir on %s for %s returned %d entries", - prev->this->name, local->loc.path, op_ret); - local->op_ret = -1; - local->op_errno = ENOTEMPTY; - } +dht_rename_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + xlator_t *prev = NULL; - this_call_cnt = dht_frame_return (frame); + local = frame->local; + prev = cookie; - if (is_last_call (this_call_cnt)) { - dht_rename_dir_do (frame, this); - } + if (op_ret > 2) { + gf_msg_trace(this->name, 0, "readdir on %s for %s returned %d entries", + prev->name, local->loc.path, op_ret); + local->op_ret = -1; + local->op_errno = ENOTEMPTY; + } - return 0; -} + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + dht_rename_dir_do(frame, this); + } + + return 0; +} int -dht_rename_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) +dht_rename_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = -1; - call_frame_t *prev = NULL; - + dht_local_t *local = NULL; + int this_call_cnt = -1; + xlator_t *prev = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; - 
local = frame->local; - prev = cookie; + local = frame->local; + prev = cookie; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "opendir on %s for %s failed (%s)", - prev->this->name, local->loc.path, - strerror (op_errno)); - goto err; - } + if (op_ret == -1) { + gf_uuid_unparse(local->loc.inode->gfid, gfid); + gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_OPENDIR_FAILED, + "opendir on %s for %s failed,(gfid = %s) ", prev->name, + local->loc.path, gfid); + goto err; + } - STACK_WIND (frame, dht_rename_readdir_cbk, - prev->this, prev->this->fops->readdir, - local->fd, 4096, 0); + fd_bind(fd); + STACK_WIND_COOKIE(frame, dht_rename_readdir_cbk, prev, prev, + prev->fops->readdir, local->fd, 4096, 0, NULL); - return 0; + return 0; err: - this_call_cnt = dht_frame_return (frame); + this_call_cnt = dht_frame_return(frame); - if (is_last_call (this_call_cnt)) { - dht_rename_dir_do (frame, this); - } + if (is_last_call(this_call_cnt)) { + dht_rename_dir_do(frame, this); + } - return 0; + return 0; } - int -dht_rename_dir (call_frame_t *frame, xlator_t *this) +dht_rename_dir_lock2_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - int i = 0; - int op_errno = -1; + dht_local_t *local = NULL; + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + dht_conf_t *conf = NULL; + int i = 0; + + local = frame->local; + conf = this->private; + + if (op_ret < 0) { + uuid_utoa_r(local->loc.inode->gfid, src_gfid); + + if (local->loc2.inode) + uuid_utoa_r(local->loc2.inode->gfid, dst_gfid); + + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR, + "acquiring entrylk after inodelk failed" + "rename (%s:%s:%s %s:%s:%s)", + local->loc.path, src_gfid, local->src_cached->name, + local->loc2.path, dst_gfid, + local->dst_cached ? 
local->dst_cached->name : NULL); + + local->op_ret = -1; + local->op_errno = op_errno; + goto err; + } + + local->fd = fd_create(local->loc.inode, frame->root->pid); + if (!local->fd) { + op_errno = ENOMEM; + goto err; + } + + local->op_ret = 0; + + if (!local->dst_cached) { + dht_rename_dir_do(frame, this); + return 0; + } + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_rename_opendir_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->opendir, &local->loc2, + local->fd, NULL); + } - conf = frame->this->private; - local = frame->local; + return 0; - local->call_cnt = conf->subvolume_cnt; +err: + /* No harm in calling an extra unlock */ + dht_rename_dir_unlock(frame, this); + return 0; +} - for (i = 0; i < conf->subvolume_cnt; i++) { - if (!conf->subvolume_status[i]) { - gf_log (this->name, GF_LOG_INFO, - "one of the subvolumes down (%s)", - conf->subvolumes[i]->name); - op_errno = ENOTCONN; - goto err; - } - } +int +dht_rename_dir_lock1_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + int ret = 0; + loc_t *loc = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + if (op_ret < 0) { + uuid_utoa_r(local->loc.inode->gfid, src_gfid); + + if (local->loc2.inode) + uuid_utoa_r(local->loc2.inode->gfid, dst_gfid); + + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR, + "acquiring entrylk after inodelk failed" + "rename (%s:%s:%s %s:%s:%s)", + local->loc.path, src_gfid, local->src_cached->name, + local->loc2.path, dst_gfid, + local->dst_cached ? 
local->dst_cached->name : NULL); + + local->op_ret = -1; + local->op_errno = op_errno; + goto err; + } + + if (local->current == &local->lock[0]) { + loc = &local->loc2; + subvol = local->dst_hashed; + local->current = &local->lock[1]; + } else { + loc = &local->loc; + subvol = local->src_hashed; + local->current = &local->lock[0]; + } + ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns, + dht_rename_dir_lock2_cbk); + if (ret < 0) { + op_errno = EINVAL; + goto err; + } + + return 0; +err: + /* No harm in calling an extra unlock */ + dht_rename_dir_unlock(frame, this); + return 0; +} - local->fd = fd_create (local->loc.inode, frame->root->pid); - if (!local->fd) { - op_errno = ENOMEM; - goto err; +/* + * If the hashed subvolumes of both source and dst are the different, + * lock in dictionary order of hashed subvol->name. This is important + * in case the parent directory is the same for both src and dst to + * prevent inodelk deadlocks when racing with a fix-layout op on the parent. + * + * If the hashed subvols are the same, use the gfid/name to determine + * the order of taking locks to prevent entrylk deadlocks when the parent + * dirs are the same. 
+ * + */ +static int +dht_order_rename_lock(call_frame_t *frame, loc_t **loc, xlator_t **subvol) +{ + int ret = 0; + int op_ret = 0; + dht_local_t *local = NULL; + char *src = NULL; + char *dst = NULL; + + local = frame->local; + + if (local->src_hashed->name == local->dst_hashed->name) { + ret = 0; + } else { + ret = strcmp(local->src_hashed->name, local->dst_hashed->name); + } + + if (ret == 0) { + /* hashed subvols are the same for src and dst */ + /* Entrylks need to be ordered*/ + + src = alloca(GF_UUID_BNAME_BUF_SIZE + strlen(local->loc.name) + 1); + if (!src) { + gf_msg(frame->this->name, GF_LOG_ERROR, ENOMEM, 0, + "Insufficient memory for src"); + op_ret = -1; + goto out; } - local->op_ret = 0; + if (!gf_uuid_is_null(local->loc.pargfid)) + uuid_utoa_r(local->loc.pargfid, src); + else if (local->loc.parent) + uuid_utoa_r(local->loc.parent->gfid, src); + else + src[0] = '\0'; - if (!local->dst_cached) { - dht_rename_dir_do (frame, this); - return 0; - } + strcat(src, local->loc.name); - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_rename_opendir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->opendir, - &local->loc2, local->fd); + dst = alloca(GF_UUID_BNAME_BUF_SIZE + strlen(local->loc2.name) + 1); + if (!dst) { + gf_msg(frame->this->name, GF_LOG_ERROR, ENOMEM, 0, + "Insufficient memory for dst"); + op_ret = -1; + goto out; } - return 0; + if (!gf_uuid_is_null(local->loc2.pargfid)) + uuid_utoa_r(local->loc2.pargfid, dst); + else if (local->loc2.parent) + uuid_utoa_r(local->loc2.parent->gfid, dst); + else + dst[0] = '\0'; + + strcat(dst, local->loc2.name); + ret = strcmp(src, dst); + } + + if (ret <= 0) { + /*inodelk in dictionary order of hashed subvol names*/ + /*entrylk in dictionary order of gfid/basename */ + local->current = &local->lock[0]; + *loc = &local->loc; + *subvol = local->src_hashed; + + } else { + local->current = &local->lock[1]; + *loc = &local->loc2; + *subvol = local->dst_hashed; + } + + op_ret = 0; + +out: 
+ return op_ret; +} + +int +dht_rename_dir(call_frame_t *frame, xlator_t *this) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; + xlator_t *subvol = NULL; + int i = 0; + int ret = 0; + int op_errno = -1; + + conf = frame->this->private; + local = frame->local; + + local->ret_cache = GF_CALLOC(conf->subvolume_cnt + 1, sizeof(int), + gf_dht_ret_cache_t); + + if (local->ret_cache == NULL) { + op_errno = ENOMEM; + goto err; + } + + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->subvolume_status[i]) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED, + "Rename dir failed: subvolume down (%s)", + conf->subvolumes[i]->name); + op_errno = ENOTCONN; + goto err; + } + } + + /* Locks on src and dst needs to ordered which otherwise might cause + * deadlocks when rename (src, dst) and rename (dst, src) is done from + * two different clients + */ + ret = dht_order_rename_lock(frame, &loc, &subvol); + if (ret) { + op_errno = ENOMEM; + goto err; + } + + /* Rename must take locks on src to avoid lookup selfheal from + * recreating src on those subvols where the rename was successful. + * The locks can't be issued parallel as two different clients might + * attempt same rename command and be in dead lock. + */ + ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns, + dht_rename_dir_lock1_cbk); + if (ret < 0) { + op_errno = EINVAL; + goto err; + } + + return 0; err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - return 0; + DHT_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; +} + +static int +dht_rename_track_for_changelog(xlator_t *this, dict_t *xattr, loc_t *oldloc, + loc_t *newloc) +{ + int ret = -1; + dht_changelog_rename_info_t *info = NULL; + char *name = NULL; + int len1 = 0; + int len2 = 0; + int size = 0; + + if (!xattr || !oldloc || !newloc || !this) + return ret; + + len1 = strlen(oldloc->name) + 1; + len2 = strlen(newloc->name) + 1; + size = sizeof(dht_changelog_rename_info_t) + len1 + len2; + + info = GF_CALLOC(size, sizeof(char), gf_common_mt_char); + if (!info) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to calloc memory"); + return ret; + } + + gf_uuid_copy(info->old_pargfid, oldloc->pargfid); + gf_uuid_copy(info->new_pargfid, newloc->pargfid); + + info->oldname_len = len1; + info->newname_len = len2; + strncpy(info->buffer, oldloc->name, len1); + name = info->buffer + len1; + strncpy(name, newloc->name, len2); + + ret = dict_set_bin(xattr, DHT_CHANGELOG_RENAME_OP_KEY, info, size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value: key = %s," + " path = %s", + DHT_CHANGELOG_RENAME_OP_KEY, oldloc->name); + GF_FREE(info); + } + + return ret; +} + +#define DHT_MARKER_DONT_ACCOUNT(xattr) \ + do { \ + int tmp = -1; \ + if (!xattr) { \ + xattr = dict_new(); \ + if (!xattr) \ + break; \ + } \ + tmp = dict_set_str(xattr, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY, "yes"); \ + if (tmp) { \ + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, \ + "Failed to set dictionary value: key = %s," \ + " path = %s", \ + GLUSTERFS_MARKER_DONT_ACCOUNT_KEY, local->loc.path); \ + } \ + } while (0) + +#define DHT_CHANGELOG_TRACK_AS_RENAME(xattr, oldloc, newloc) \ + do { \ + int tmp = -1; \ + if (!xattr) { \ + xattr = dict_new(); \ + if (!xattr) { \ 
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, \ + "Failed to create dictionary to " \ + "track rename"); \ + break; \ + } \ + } \ + \ + tmp = dht_rename_track_for_changelog(this, xattr, oldloc, newloc); \ + \ + if (tmp) { \ + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, \ + "Failed to set dictionary value: key = %s," \ + " path = %s", \ + DHT_CHANGELOG_RENAME_OP_KEY, (oldloc)->path); \ + } \ + } while (0) + +int +dht_rename_unlock(call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + int op_ret = -1; + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + dht_ilock_wrap_t inodelk_wrapper = { + 0, + }; + + local = frame->local; + inodelk_wrapper.locks = local->rename_inodelk_backward_compatible; + inodelk_wrapper.lk_count = local->rename_inodelk_bc_count; + + op_ret = dht_unlock_inodelk_wrapper(frame, &inodelk_wrapper); + if (op_ret < 0) { + uuid_utoa_r(local->loc.inode->gfid, src_gfid); + + if (local->loc2.inode) + uuid_utoa_r(local->loc2.inode->gfid, dst_gfid); + + if (IA_ISREG(local->stbuf.ia_type)) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED, + "winding unlock inodelk failed " + "rename (%s:%s:%s %s:%s:%s), " + "stale locks left on bricks", + local->loc.path, src_gfid, local->src_cached->name, + local->loc2.path, dst_gfid, + local->dst_cached ? 
local->dst_cached->name : NULL); + else + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED, + "winding unlock inodelk failed " + "rename (%s:%s %s:%s), " + "stale locks left on bricks", + local->loc.path, src_gfid, local->loc2.path, dst_gfid); + } + + dht_unlock_namespace(frame, &local->lock[0]); + dht_unlock_namespace(frame, &local->lock[1]); + + dht_rename_unlock_cbk(frame, NULL, this, local->op_ret, local->op_errno, + NULL); + return 0; } +int +dht_rename_done(call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + + local = frame->local; + + if (local->linked == _gf_true) { + local->linked = _gf_false; + dht_linkfile_attr_heal(frame, this); + } + + dht_rename_unlock(frame, this); + return 0; +} int -dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) +dht_rename_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - int this_call_cnt = 0; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int this_call_cnt = 0; - local = frame->local; - prev = cookie; + local = frame->local; + prev = cookie; - if (!local) { - gf_log (this->name, GF_LOG_ERROR, - "!local, should not happen"); - goto out; - } + FRAME_SU_UNDO(frame, dht_local_t); + if (!local) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_VALUE, + "!local, should not happen"); + goto out; + } - this_call_cnt = dht_frame_return (frame); + this_call_cnt = dht_frame_return(frame); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "%s: unlink on %s failed (%s)", - local->loc.path, prev->this->name, strerror (op_errno)); - } + if (op_ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLINK_FAILED, + "%s: Rename: unlink on %s failed ", local->loc.path, prev->name); + } - 
WIPE (&local->preoldparent); - WIPE (&local->postoldparent); - WIPE (&local->preparent); - WIPE (&local->postparent); + WIPE(&local->preoldparent); + WIPE(&local->postoldparent); + WIPE(&local->preparent); + WIPE(&local->postparent); - if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preoldparent, - &local->postoldparent, &local->preparent, - &local->postparent); - } + if (is_last_call(this_call_cnt)) { + dht_rename_done(frame, this); + } out: - return 0; + return 0; } - int -dht_rename_cleanup (call_frame_t *frame) +dht_rename_cleanup(call_frame_t *frame) { - dht_local_t *local = NULL; - xlator_t *this = NULL; - xlator_t *src_hashed = NULL; - xlator_t *src_cached = NULL; - xlator_t *dst_hashed = NULL; - xlator_t *dst_cached = NULL; - int call_cnt = 0; + dht_local_t *local = NULL; + xlator_t *this = NULL; + xlator_t *src_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + int call_cnt = 0; + dict_t *xattr = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + local = frame->local; + this = frame->this; - local = frame->local; - this = frame->this; + src_hashed = local->src_hashed; + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; - src_hashed = local->src_hashed; - src_cached = local->src_cached; - dst_hashed = local->dst_hashed; - dst_cached = local->dst_cached; + if (src_cached == dst_cached) + goto nolinks; - if (src_cached == dst_cached) - goto nolinks; + if (local->linked && (dst_hashed != src_hashed) && + (dst_hashed != src_cached)) { + call_cnt++; + } - if (dst_hashed != src_hashed && dst_hashed != src_cached) - call_cnt++; + if (local->added_link && (src_cached != dst_hashed)) { + call_cnt++; + } - if (src_cached != dst_hashed) - call_cnt++; + local->call_cnt = call_cnt; - local->call_cnt = call_cnt; + if (!call_cnt) + goto nolinks; - if (!call_cnt) - goto nolinks; + 
DHT_MARK_FOP_INTERNAL(xattr); - if (dst_hashed != src_hashed && dst_hashed != src_cached) { - gf_log (this->name, GF_LOG_TRACE, - "unlinking linkfile %s @ %s => %s", - local->loc.path, dst_hashed->name, src_cached->name); - STACK_WIND (frame, dht_rename_unlink_cbk, - dst_hashed, dst_hashed->fops->unlink, - &local->loc); - } + gf_uuid_unparse(local->loc.inode->gfid, gfid); - if (src_cached != dst_hashed) { - gf_log (this->name, GF_LOG_TRACE, - "unlinking link %s => %s (%s)", local->loc.path, - local->loc2.path, src_cached->name); - STACK_WIND (frame, dht_rename_unlink_cbk, - src_cached, src_cached->fops->unlink, - &local->loc2); + if (local->linked && (dst_hashed != src_hashed) && + (dst_hashed != src_cached)) { + dict_t *xattr_new = NULL; + + gf_msg_trace(this->name, 0, + "unlinking linkfile %s @ %s => %s, (gfid = %s)", + local->loc.path, dst_hashed->name, src_cached->name, gfid); + + xattr_new = dict_copy_with_ref(xattr, NULL); + + DHT_MARKER_DONT_ACCOUNT(xattr_new); + + FRAME_SU_DO(frame, dht_local_t); + STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, dst_hashed, dst_hashed, + dst_hashed->fops->unlink, &local->loc, 0, xattr_new); + + dict_unref(xattr_new); + xattr_new = NULL; + } + + if (local->added_link && (src_cached != dst_hashed)) { + dict_t *xattr_new = NULL; + + gf_msg_trace(this->name, 0, "unlinking link %s => %s (%s), (gfid = %s)", + local->loc.path, local->loc2.path, src_cached->name, gfid); + + xattr_new = dict_copy_with_ref(xattr, NULL); + + if (gf_uuid_compare(local->loc.pargfid, local->loc2.pargfid) == 0) { + DHT_MARKER_DONT_ACCOUNT(xattr_new); } + /* * + * The link to file is created using root permission. + * Hence deletion should happen using root. Otherwise + * it will fail. 
+ */ + FRAME_SU_DO(frame, dht_local_t); + STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, src_cached, src_cached, + src_cached->fops->unlink, &local->loc2, 0, xattr_new); - return 0; + dict_unref(xattr_new); + xattr_new = NULL; + } -nolinks: - WIPE (&local->preoldparent); - WIPE (&local->postoldparent); - WIPE (&local->preparent); - WIPE (&local->postparent); + if (xattr) + dict_unref(xattr); - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preoldparent, - &local->postoldparent, &local->preparent, - &local->postparent); + return 0; - return 0; -} +nolinks: + WIPE(&local->preoldparent); + WIPE(&local->postoldparent); + WIPE(&local->preparent); + WIPE(&local->postparent); + dht_rename_unlock(frame, this); + return 0; +} int -dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *stbuf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) -{ - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - xlator_t *src_hashed = NULL; - xlator_t *src_cached = NULL; - xlator_t *dst_hashed = NULL; - xlator_t *dst_cached = NULL; - xlator_t *rename_subvol = NULL; - - local = frame->local; - prev = cookie; - - src_hashed = local->src_hashed; - src_cached = local->src_cached; - dst_hashed = local->dst_hashed; - dst_cached = local->dst_cached; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "%s: rename on %s failed (%s)", local->loc.path, - prev->this->name, strerror (op_errno)); - local->op_ret = op_ret; - local->op_errno = op_errno; - goto cleanup; - } +dht_rename_unlink(call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + xlator_t *src_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + xlator_t *rename_subvol = NULL; + dict_t *xattr = NULL; - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, 
&local->preoldparent, preoldparent, prev->this); - dht_iatt_merge (this, &local->postoldparent, postoldparent, prev->this); - dht_iatt_merge (this, &local->preparent, prenewparent, prev->this); - dht_iatt_merge (this, &local->postparent, postnewparent, prev->this); + local = frame->local; - local->stbuf.ia_ino = local->loc.inode->ino; + src_hashed = local->src_hashed; + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; - local->preoldparent.ia_ino = local->loc.parent->ino; - local->postoldparent.ia_ino = local->loc.parent->ino; + local->call_cnt = 0; - local->preparent.ia_ino = local->loc2.parent->ino; - local->postparent.ia_ino = local->loc2.parent->ino; + /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk + * is called. since rename has already happened on rename_subvol, + * unlink shouldn't be sent for oldpath (either linkfile or cached-file) + * on rename_subvol. */ + if (src_cached == dst_cached) + rename_subvol = src_cached; + else + rename_subvol = dst_hashed; - /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk - * is called. since rename has already happened on rename_subvol, - * unlink should not be sent for oldpath (either linkfile or cached-file) - * on rename_subvol. 
*/ - if (src_cached == dst_cached) - rename_subvol = src_cached; - else - rename_subvol = dst_hashed; + /* TODO: delete files in background */ - /* TODO: delete files in background */ + if (src_cached != dst_hashed && src_cached != dst_cached) + local->call_cnt++; - if (src_cached != dst_hashed && src_cached != dst_cached) - local->call_cnt++; + if (src_hashed != rename_subvol && src_hashed != src_cached) + local->call_cnt++; - if (src_hashed != rename_subvol && src_hashed != src_cached) - local->call_cnt++; + if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached) + local->call_cnt++; - if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached) - local->call_cnt++; + if (local->call_cnt == 0) + goto unwind; - if (local->call_cnt == 0) - goto unwind; + DHT_MARK_FOP_INTERNAL(xattr); - if (src_cached != dst_hashed && src_cached != dst_cached) { - gf_log (this->name, GF_LOG_TRACE, - "deleting old src datafile %s @ %s", - local->loc.path, src_cached->name); + if (src_cached != dst_hashed && src_cached != dst_cached) { + dict_t *xattr_new = NULL; - STACK_WIND (frame, dht_rename_unlink_cbk, - src_cached, src_cached->fops->unlink, - &local->loc); - } + xattr_new = dict_copy_with_ref(xattr, NULL); - if (src_hashed != rename_subvol && src_hashed != src_cached) { - gf_log (this->name, GF_LOG_TRACE, - "deleting old src linkfile %s @ %s", - local->loc.path, src_hashed->name); + gf_msg_trace(this->name, 0, "deleting old src datafile %s @ %s", + local->loc.path, src_cached->name); - STACK_WIND (frame, dht_rename_unlink_cbk, - src_hashed, src_hashed->fops->unlink, - &local->loc); + if (gf_uuid_compare(local->loc.pargfid, local->loc2.pargfid) == 0) { + DHT_MARKER_DONT_ACCOUNT(xattr_new); } - if (dst_cached - && (dst_cached != dst_hashed) - && (dst_cached != src_cached)) { - gf_log (this->name, GF_LOG_TRACE, - "deleting old dst datafile %s @ %s", - local->loc2.path, dst_cached->name); + DHT_CHANGELOG_TRACK_AS_RENAME(xattr_new, &local->loc, 
&local->loc2); + STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, src_cached, src_cached, + src_cached->fops->unlink, &local->loc, 0, xattr_new); - STACK_WIND (frame, dht_rename_unlink_cbk, - dst_cached, dst_cached->fops->unlink, - &local->loc2); - } - return 0; + dict_unref(xattr_new); + xattr_new = NULL; + } + + if (src_hashed != rename_subvol && src_hashed != src_cached) { + dict_t *xattr_new = NULL; + + xattr_new = dict_copy_with_ref(xattr, NULL); + + gf_msg_trace(this->name, 0, "deleting old src linkfile %s @ %s", + local->loc.path, src_hashed->name); + + DHT_MARKER_DONT_ACCOUNT(xattr_new); + + STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, src_hashed, src_hashed, + src_hashed->fops->unlink, &local->loc, 0, xattr_new); + + dict_unref(xattr_new); + xattr_new = NULL; + } + + if (dst_cached && (dst_cached != dst_hashed) && + (dst_cached != src_cached)) { + gf_msg_trace(this->name, 0, "deleting old dst datafile %s @ %s", + local->loc2.path, dst_cached->name); + + STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, dst_cached, dst_cached, + dst_cached->fops->unlink, &local->loc2, 0, xattr); + } + if (xattr) + dict_unref(xattr); + return 0; unwind: - WIPE (&local->preoldparent); - WIPE (&local->postoldparent); - WIPE (&local->preparent); - WIPE (&local->postparent); + WIPE(&local->preoldparent); + WIPE(&local->postoldparent); + WIPE(&local->preparent); + WIPE(&local->postparent); + + dht_rename_done(frame, this); + + return 0; +} + +int +dht_rename_links_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + xlator_t *prev = NULL; + dht_local_t *local = NULL; + call_frame_t *main_frame = NULL; + + prev = cookie; + local = frame->local; + main_frame = local->main_frame; + + /* TODO: Handle this case in lookup-optimize */ + if (op_ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_CREATE_LINK_FAILED, + 
"link/file %s on %s failed", local->loc.path, prev->name); + } + + if (local->linked == _gf_true) { + local->linked = _gf_false; + dht_linkfile_attr_heal(frame, this); + } + + dht_rename_unlink(main_frame, this); + DHT_STACK_DESTROY(frame); + return 0; +} + +int +dht_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *stbuf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + call_frame_t *link_frame = NULL; + dht_local_t *link_local = NULL; + + local = frame->local; + prev = cookie; + + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; + + if (local->linked == _gf_true) + FRAME_SU_UNDO(frame, dht_local_t); + + /* It is a critical failure iff we fail to rename the cached file + * if the rename of the linkto failed, it is not a critical failure, + * and we do not want to lose the created hard link for the new + * name as that could have been read by other clients. + * + * NOTE: If another client is attempting the same oldname -> newname + * rename, and finds both file names as existing, and are hard links + * to each other, then FUSE would send in an unlink for oldname. In + * this time duration if we treat the linkto as a critical error and + * unlink the newname we created, we would have effectively lost the + * file to rename operations. 
+ * + * Repercussions of treating this as a non-critical error is that + * we could leave behind a stale linkto file and/or not create the new + * linkto file, the second case would be rectified by a subsequent + * lookup, the first case by a rebalance, like for all stale linkto + * files */ + + if (op_ret == -1) { + /* Critical failure: unable to rename the cached file */ + if (prev == src_cached) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_RENAME_FAILED, + "%s: Rename on %s failed, (gfid = %s) ", local->loc.path, + prev->name, + local->loc.inode ? uuid_utoa(local->loc.inode->gfid) : ""); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto cleanup; + } else { + /* Non-critical failure, unable to rename the linkto + * file + */ + gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_RENAME_FAILED, + "%s: Rename (linkto file) on %s failed, " + "(gfid = %s) ", + local->loc.path, prev->name, + local->loc.inode ? uuid_utoa(local->loc.inode->gfid) : ""); + } + } + if (xdata) { + if (!local->xattr) + local->xattr = dict_ref(xdata); + else + local->xattr = dict_copy_with_ref(xdata, local->xattr); + } + + /* Merge attrs only from src_cached. In case there of src_cached != + * dst_hashed, this ignores linkfile attrs. 
*/ + if (prev == src_cached) { + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->preoldparent, preoldparent); + dht_iatt_merge(this, &local->postoldparent, postoldparent); + dht_iatt_merge(this, &local->preparent, prenewparent); + dht_iatt_merge(this, &local->postparent, postnewparent); + } + + /* Create the linkto file for the dst file */ + if ((src_cached == dst_cached) && (dst_hashed != dst_cached)) { + link_frame = copy_frame(frame); + if (!link_frame) { + goto unlink; + } + + /* fop value sent as maxvalue because it is not used + * anywhere in this case */ + link_local = dht_local_init(link_frame, &local->loc2, NULL, + GF_FOP_MAXVALUE); + if (!link_local) { + goto unlink; + } - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preoldparent, - &local->postoldparent, &local->preparent, - &local->postparent); + if (link_local->loc.inode) + inode_unref(link_local->loc.inode); + link_local->loc.inode = inode_ref(local->loc.inode); + link_local->main_frame = frame; + link_local->stbuf = local->stbuf; + gf_uuid_copy(link_local->gfid, local->loc.inode->gfid); + dht_linkfile_create(link_frame, dht_rename_links_create_cbk, this, + src_cached, dst_hashed, &link_local->loc); return 0; + } + +unlink: + + if (link_frame) { + DHT_STACK_DESTROY(link_frame); + } + dht_rename_unlink(frame, this); + return 0; cleanup: - dht_rename_cleanup (frame); + dht_rename_cleanup(frame); - return 0; + return 0; } +int +dht_do_rename(call_frame_t *frame) +{ + dht_local_t *local = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_cached = NULL; + xlator_t *this = NULL; + xlator_t *rename_subvol = NULL; + + local = frame->local; + this = frame->this; + + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; + src_cached = local->src_cached; + + if (src_cached == dst_cached) + rename_subvol = src_cached; + else + rename_subvol = dst_hashed; + + if ((src_cached != dst_hashed) && 
(rename_subvol == dst_hashed)) { + DHT_MARKER_DONT_ACCOUNT(local->xattr_req); + } + + if (rename_subvol == src_cached) { + DHT_CHANGELOG_TRACK_AS_RENAME(local->xattr_req, &local->loc, + &local->loc2); + } + + gf_msg_trace(this->name, 0, "renaming %s => %s (%s)", local->loc.path, + local->loc2.path, rename_subvol->name); + + if (local->linked == _gf_true) + FRAME_SU_DO(frame, dht_local_t); + STACK_WIND_COOKIE(frame, dht_rename_cbk, rename_subvol, rename_subvol, + rename_subvol->fops->rename, &local->loc, &local->loc2, + local->xattr_req); + return 0; +} int -dht_do_rename (call_frame_t *frame) +dht_rename_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; - xlator_t *dst_hashed = NULL; - xlator_t *src_cached = NULL; - xlator_t *dst_cached = NULL; - xlator_t *this = NULL; - xlator_t *rename_subvol = NULL; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + local = frame->local; + prev = cookie; - local = frame->local; - this = frame->this; + if (op_ret == -1) { + gf_msg_debug(this->name, 0, "link/file on %s failed (%s)", prev->name, + strerror(op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + local->added_link = _gf_false; + } else + dht_iatt_merge(this, &local->stbuf, stbuf); - dst_hashed = local->dst_hashed; - dst_cached = local->dst_cached; - src_cached = local->src_cached; + if (local->op_ret == -1) + goto cleanup; - if (src_cached == dst_cached) - rename_subvol = src_cached; - else - rename_subvol = dst_hashed; + dht_do_rename(frame); - gf_log (this->name, GF_LOG_TRACE, - "renaming %s => %s (%s)", - local->loc.path, local->loc2.path, rename_subvol->name); + return 0; - STACK_WIND (frame, dht_rename_cbk, - rename_subvol, rename_subvol->fops->rename, - &local->loc, &local->loc2); +cleanup: + dht_rename_cleanup(frame); - return 0; + return 0; } - int 
-dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) +dht_rename_linkto_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - int this_call_cnt = 0; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + xlator_t *src_cached = NULL; + dict_t *xattr = NULL; + local = frame->local; + DHT_MARK_FOP_INTERNAL(xattr); + prev = cookie; + src_cached = local->src_cached; - local = frame->local; - prev = cookie; + if (op_ret == -1) { + gf_msg_debug(this->name, 0, "link/file on %s failed (%s)", prev->name, + strerror(op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "link/file on %s failed (%s)", - prev->this->name, strerror (op_errno)); - local->op_ret = -1; - local->op_errno = op_errno; - } + /* If linkto creation failed move to failure cleanup code, + * instead of continuing with creating the link file */ + if (local->op_ret != 0) { + goto cleanup; + } - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - if (local->op_ret == -1) - goto cleanup; + gf_msg_trace(this->name, 0, "link %s => %s (%s)", local->loc.path, + local->loc2.path, src_cached->name); + if (gf_uuid_compare(local->loc.pargfid, local->loc2.pargfid) == 0) { + DHT_MARKER_DONT_ACCOUNT(xattr); + } - dht_do_rename (frame); - } + local->added_link = _gf_true; - return 0; + STACK_WIND_COOKIE(frame, dht_rename_link_cbk, src_cached, src_cached, + src_cached->fops->link, &local->loc, &local->loc2, xattr); + + if (xattr) + dict_unref(xattr); + + return 0; cleanup: - dht_rename_cleanup (frame); + dht_rename_cleanup(frame); - return 0; + if (xattr) + dict_unref(xattr); + + 
return 0; } +int +dht_rename_unlink_links_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + gf_msg_debug(this->name, 0, "unlink of %s on %s failed (%s)", + local->loc2.path, prev->name, strerror(op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } + + if (local->op_ret == -1) + goto cleanup; + + dht_do_rename(frame); + + return 0; + +cleanup: + dht_rename_cleanup(frame); + + return 0; +} int -dht_rename_create_links (call_frame_t *frame) +dht_rename_create_links(call_frame_t *frame) { - dht_local_t *local = NULL; - xlator_t *this = NULL; - xlator_t *src_hashed = NULL; - xlator_t *src_cached = NULL; - xlator_t *dst_hashed = NULL; - xlator_t *dst_cached = NULL; - int call_cnt = 0; + dht_local_t *local = NULL; + xlator_t *this = NULL; + xlator_t *src_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + int call_cnt = 0; + dict_t *xattr = NULL; + local = frame->local; + this = frame->this; - local = frame->local; - this = frame->this; + src_hashed = local->src_hashed; + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; - src_hashed = local->src_hashed; - src_cached = local->src_cached; - dst_hashed = local->dst_hashed; - dst_cached = local->dst_cached; + DHT_MARK_FOP_INTERNAL(xattr); - if (src_cached == dst_cached) - goto nolinks; + if (src_cached == dst_cached) { + dict_t *xattr_new = NULL; - if (dst_hashed != src_hashed && dst_hashed != src_cached) - call_cnt++; + if (dst_hashed == dst_cached) + goto nolinks; - if (src_cached != dst_hashed) - call_cnt++; + xattr_new = dict_copy_with_ref(xattr, NULL); - local->call_cnt = call_cnt; + gf_msg_trace(this->name, 0, "unlinking dst linkfile %s @ 
%s", + local->loc2.path, dst_hashed->name); - if (dst_hashed != src_hashed && dst_hashed != src_cached) { - gf_log (this->name, GF_LOG_TRACE, - "linkfile %s @ %s => %s", - local->loc.path, dst_hashed->name, src_cached->name); - memcpy (local->gfid, local->loc.inode->gfid, 16); - dht_linkfile_create (frame, dht_rename_links_cbk, - src_cached, dst_hashed, &local->loc); - } + DHT_MARKER_DONT_ACCOUNT(xattr_new); + + STACK_WIND_COOKIE(frame, dht_rename_unlink_links_cbk, dst_hashed, + dst_hashed, dst_hashed->fops->unlink, &local->loc2, 0, + xattr_new); - if (src_cached != dst_hashed) { - gf_log (this->name, GF_LOG_TRACE, - "link %s => %s (%s)", local->loc.path, - local->loc2.path, src_cached->name); - STACK_WIND (frame, dht_rename_links_cbk, - src_cached, src_cached->fops->link, - &local->loc, &local->loc2); + dict_unref(xattr_new); + if (xattr) + dict_unref(xattr); + + return 0; + } + + if (src_cached != dst_hashed) { + /* needed to create the link file */ + call_cnt++; + if (dst_hashed != src_hashed) + /* needed to create the linkto file */ + call_cnt++; + } + + /* We should not have any failures post the link creation, as this + * introduces the newname into the namespace. Clients could have cached + * the existence of the newname and may start taking actions based on + * the same. Hence create the linkto first, and then attempt the link. + * + * NOTE: If another client is attempting the same oldname -> newname + * rename, and finds both file names as existing, and are hard links + * to each other, then FUSE would send in an unlink for oldname. In + * this time duration if we treat the linkto as a critical error and + * unlink the newname we created, we would have effectively lost the + * file to rename operations. 
*/ + if (dst_hashed != src_hashed && src_cached != dst_hashed) { + gf_msg_trace(this->name, 0, "linkfile %s @ %s => %s", local->loc.path, + dst_hashed->name, src_cached->name); + + memcpy(local->gfid, local->loc.inode->gfid, 16); + dht_linkfile_create(frame, dht_rename_linkto_cbk, this, src_cached, + dst_hashed, &local->loc); + } else if (src_cached != dst_hashed) { + dict_t *xattr_new = NULL; + + xattr_new = dict_copy_with_ref(xattr, NULL); + + gf_msg_trace(this->name, 0, "link %s => %s (%s)", local->loc.path, + local->loc2.path, src_cached->name); + if (gf_uuid_compare(local->loc.pargfid, local->loc2.pargfid) == 0) { + DHT_MARKER_DONT_ACCOUNT(xattr_new); } + local->added_link = _gf_true; + + STACK_WIND_COOKIE(frame, dht_rename_link_cbk, src_cached, src_cached, + src_cached->fops->link, &local->loc, &local->loc2, + xattr_new); + + dict_unref(xattr_new); + } + nolinks: - if (!call_cnt) { - /* skip to next step */ - dht_do_rename (frame); + if (!call_cnt) { + /* skip to next step */ + dht_do_rename(frame); + } + if (xattr) + dict_unref(xattr); + + return 0; +} + +int +dht_rename_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) +{ + dht_local_t *local = NULL; + int call_cnt = 0; + dht_conf_t *conf = NULL; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + char gfid_server[GF_UUID_BUF_SIZE] = {0}; + int child_index = -1; + gf_boolean_t is_src = _gf_false; + loc_t *loc = NULL; + + child_index = (long)cookie; + + local = frame->local; + conf = this->private; + + is_src = (child_index == 0); + if (is_src) + loc = &local->loc; + else + loc = &local->loc2; + + if (op_ret >= 0) { + if (is_src) + local->src_cached = dht_subvol_get_cached(this, local->loc.inode); + else { + if (loc->inode) + gf_uuid_unparse(loc->inode->gfid, gfid_local); + + gf_msg_debug(this->name, 0, + "dst_cached before lookup: %s, " + "(path:%s)(gfid:%s),", + local->loc2.path, + 
local->dst_cached ? local->dst_cached->name : NULL, + local->dst_cached ? gfid_local : NULL); + + local->dst_cached = dht_subvol_get_cached(this, + local->loc2_copy.inode); + + gf_uuid_unparse(stbuf->ia_gfid, gfid_local); + + gf_msg_debug(this->name, GF_LOG_WARNING, + "dst_cached after lookup: %s, " + "(path:%s)(gfid:%s)", + local->loc2.path, + local->dst_cached ? local->dst_cached->name : NULL, + local->dst_cached ? gfid_local : NULL); + + if ((local->loc2.inode == NULL) || + gf_uuid_compare(stbuf->ia_gfid, local->loc2.inode->gfid)) { + if (local->loc2.inode != NULL) { + inode_unlink(local->loc2.inode, local->loc2.parent, + local->loc2.name); + inode_unref(local->loc2.inode); + } + + local->loc2.inode = inode_link(local->loc2_copy.inode, + local->loc2_copy.parent, + local->loc2_copy.name, stbuf); + gf_uuid_copy(local->loc2.gfid, stbuf->ia_gfid); + } + } + } + + if (op_ret < 0) { + if (is_src) { + /* The meaning of is_linkfile is overloaded here. For locking + * to work properly both rebalance and rename should acquire + * lock on datafile. The reason for sending this lookup is to + * find out whether we've acquired a lock on data file. + * Between the lookup before rename and this rename, the + * file could be migrated by a rebalance process and now this + * file this might be a linkto file. We verify that by sending + * this lookup. However, if this lookup fails we cannot really + * say whether we've acquired lock on a datafile or linkto file. + * So, we act conservatively and _assume_ + * that this is a linkfile and fail the rename operation. 
+ */ + local->is_linkfile = _gf_true; + local->op_errno = op_errno; + } else { + if (local->dst_cached) + gf_msg_debug(this->name, op_errno, + "file %s (gfid:%s) was present " + "(hashed-subvol=%s, " + "cached-subvol=%s) before rename," + " but lookup failed", + local->loc2.path, + uuid_utoa(local->loc2.inode->gfid), + local->dst_hashed->name, local->dst_cached->name); + if (dht_inode_missing(op_errno)) + local->dst_cached = NULL; + } + } else if (is_src && xattr && + check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name)) { + local->is_linkfile = _gf_true; + /* Found linkto file instead of data file, passdown ENOENT + * based on the above comment */ + local->op_errno = ENOENT; + } + + if (!local->is_linkfile && (op_ret >= 0) && + gf_uuid_compare(loc->gfid, stbuf->ia_gfid)) { + gf_uuid_unparse(loc->gfid, gfid_local); + gf_uuid_unparse(stbuf->ia_gfid, gfid_server); + + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH, + "path:%s, received a different gfid, local_gfid= %s" + " server_gfid: %s", + local->loc.path, gfid_local, gfid_server); + + /* Will passdown ENOENT anyway since the file we sent on + * rename is replaced with a different file */ + local->op_errno = ENOENT; + /* Since local->is_linkfile is used here to detect failure, + * marking this to true */ + local->is_linkfile = _gf_true; + } + + call_cnt = dht_frame_return(frame); + if (is_last_call(call_cnt)) { + if (local->is_linkfile) { + local->op_ret = -1; + goto fail; } - return 0; -} + dht_rename_create_links(frame); + } + return 0; +fail: + dht_rename_unlock(frame, this); + return 0; +} int -dht_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) -{ - xlator_t *src_cached = NULL; - xlator_t *src_hashed = NULL; - xlator_t *dst_cached = NULL; - xlator_t *dst_hashed = NULL; - int op_errno = -1; - int ret = -1; - dht_local_t *local = NULL; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (oldloc, err); - VALIDATE_OR_GOTO 
(newloc, err); - - src_hashed = dht_subvol_get_hashed (this, oldloc); - if (!src_hashed) { - gf_log (this->name, GF_LOG_INFO, - "no subvolume in layout for path=%s", - oldloc->path); - op_errno = EINVAL; - goto err; - } +dht_rename_file_lock1_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + int ret = 0; + loc_t *loc = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + if (op_ret < 0) { + uuid_utoa_r(local->loc.inode->gfid, src_gfid); + + if (local->loc2.inode) + uuid_utoa_r(local->loc2.inode->gfid, dst_gfid); + + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR, + "protecting namespace of %s failed" + "rename (%s:%s:%s %s:%s:%s)", + local->current == &local->lock[0] ? local->loc.path + : local->loc2.path, + local->loc.path, src_gfid, local->src_hashed->name, + local->loc2.path, dst_gfid, + local->dst_hashed ? 
local->dst_hashed->name : NULL); + + local->op_ret = -1; + local->op_errno = op_errno; + goto err; + } + + if (local->current == &local->lock[0]) { + loc = &local->loc2; + subvol = local->dst_hashed; + local->current = &local->lock[1]; + } else { + loc = &local->loc; + subvol = local->src_hashed; + local->current = &local->lock[0]; + } + + ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns, + dht_rename_lock_cbk); + if (ret < 0) { + op_errno = EINVAL; + goto err; + } + + return 0; +err: + /* No harm in calling an extra unlock */ + dht_rename_unlock(frame, this); + return 0; +} - src_cached = dht_subvol_get_cached (this, oldloc->inode); - if (!src_cached) { - gf_log (this->name, GF_LOG_INFO, - "no cached subvolume for path=%s", oldloc->path); - op_errno = EINVAL; - goto err; - } +int32_t +dht_rename_file_protect_namespace(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + int ret = 0; + loc_t *loc = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + if (op_ret < 0) { + uuid_utoa_r(local->loc.inode->gfid, src_gfid); + + if (local->loc2.inode) + uuid_utoa_r(local->loc2.inode->gfid, dst_gfid); + + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR, + "acquiring inodelk failed " + "rename (%s:%s:%s %s:%s:%s)", + local->loc.path, src_gfid, local->src_cached->name, + local->loc2.path, dst_gfid, + local->dst_cached ? 
local->dst_cached->name : NULL); + + local->op_ret = -1; + local->op_errno = op_errno; + + goto err; + } + + /* Locks on src and dst needs to ordered which otherwise might cause + * deadlocks when rename (src, dst) and rename (dst, src) is done from + * two different clients + */ + ret = dht_order_rename_lock(frame, &loc, &subvol); + if (ret) { + local->op_errno = ENOMEM; + goto err; + } + + ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns, + dht_rename_file_lock1_cbk); + if (ret < 0) { + op_errno = EINVAL; + goto err; + } + + return 0; - dst_hashed = dht_subvol_get_hashed (this, newloc); - if (!dst_hashed) { - gf_log (this->name, GF_LOG_INFO, - "no subvolume in layout for path=%s", - newloc->path); - op_errno = EINVAL; - goto err; - } +err: + /* Its fine to call unlock even when no locks are acquired, as we check + * for lock->locked before winding a unlock call. + */ + dht_rename_unlock(frame, this); - if (newloc->inode) - dst_cached = dht_subvol_get_cached (this, newloc->inode); + return 0; +} - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; +int32_t +dht_rename_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + dict_t *xattr_req = NULL; + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *subvol = NULL; + dht_lock_t *lock = NULL; + + local = frame->local; + conf = this->private; + + if (op_ret < 0) { + uuid_utoa_r(local->loc.inode->gfid, src_gfid); + + if (local->loc2.inode) + uuid_utoa_r(local->loc2.inode->gfid, dst_gfid); + + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR, + "protecting namespace of %s failed. " + "rename (%s:%s:%s %s:%s:%s)", + local->current == &local->lock[0] ? 
local->loc.path + : local->loc2.path, + local->loc.path, src_gfid, local->src_hashed->name, + local->loc2.path, dst_gfid, + local->dst_hashed ? local->dst_hashed->name : NULL); + + local->op_ret = -1; + local->op_errno = op_errno; + + goto done; + } + + xattr_req = dict_new(); + if (xattr_req == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto done; + } + + op_ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto done; + } + + /* dst_cached might've changed. This normally happens for two reasons: + * 1. rebalance migrated dst + * 2. Another parallel rename was done overwriting dst + * + * Doing a lookup on local->loc2 when dst exists, but is associated + * with a different gfid will result in an ESTALE error. So, do a fresh + * lookup with a new inode on dst-path and handle change of dst-cached + * in the cbk. Also, to identify dst-cached changes we do a lookup on + * "this" rather than the subvol. + */ + loc_copy(&local->loc2_copy, &local->loc2); + inode_unref(local->loc2_copy.inode); + local->loc2_copy.inode = inode_new(local->loc.inode->table); + + /* Why not use local->lock.locks[?].loc for lookup post lock phase + * --------------------------------------------------------------- + * "layout.parent_layout.locks[?].loc" does not have the name and pargfid + * populated. + * Reason: If we had populated the name and pargfid, server might + * resolve to a successful lookup even if there is a file with same name + * with a different gfid(unlink & create) as server does name based + * resolution on first priority. And this can result in operating on a + * different inode entirely. + * + * Now consider a scenario where source file was renamed by some other + * client to a new name just before this lock was granted. 
So if a + * lookup would be done on local->lock[0].layout.parent_layout.locks[?].loc, + * server will send success even if the entry was renamed (since server will + * do a gfid based resolution). So once a lock is granted, make sure the + * file exists with the name that the client requested with. + * */ + + local->call_cnt = 2; + for (i = 0; i < 2; i++) { + if (i == 0) { + lock = local->rename_inodelk_backward_compatible[0]; + if (gf_uuid_compare(local->loc.gfid, lock->loc.gfid) == 0) + subvol = lock->xl; + else { + lock = local->rename_inodelk_backward_compatible[1]; + subvol = lock->xl; + } + } else { + subvol = this; } - ret = loc_copy (&local->loc, oldloc); - if (ret == -1) { - op_errno = ENOMEM; - goto err; - } + STACK_WIND_COOKIE(frame, dht_rename_lookup_cbk, (void *)(long)i, subvol, + subvol->fops->lookup, + (i == 0) ? &local->loc : &local->loc2_copy, + xattr_req); + } - ret = loc_copy (&local->loc2, newloc); - if (ret == -1) { - op_errno = ENOMEM; - goto err; - } + dict_unref(xattr_req); + return 0; - local->src_hashed = src_hashed; - local->src_cached = src_cached; - local->dst_hashed = dst_hashed; - local->dst_cached = dst_cached; +done: + /* Its fine to call unlock even when no locks are acquired, as we check + * for lock->locked before winding a unlock call. + */ + dht_rename_unlock(frame, this); - gf_log (this->name, GF_LOG_TRACE, - "renaming %s (hash=%s/cache=%s) => %s (hash=%s/cache=%s)", - oldloc->path, src_hashed->name, src_cached->name, - newloc->path, dst_hashed->name, - dst_cached ? 
dst_cached->name : "<nul>"); + if (xattr_req) + dict_unref(xattr_req); - if (IA_ISDIR (oldloc->inode->ia_type)) { - dht_rename_dir (frame, this); - } else { - local->op_ret = 0; - dht_rename_create_links (frame); + return 0; +} + +int +dht_rename_lock(call_frame_t *frame) +{ + dht_local_t *local = NULL; + int count = 1, ret = -1; + dht_lock_t **lk_array = NULL; + + local = frame->local; + + if (local->dst_cached) + count++; + + lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer); + if (lk_array == NULL) + goto err; + + lk_array[0] = dht_lock_new(frame->this, local->src_cached, &local->loc, + F_WRLCK, DHT_FILE_MIGRATE_DOMAIN, NULL, + FAIL_ON_ANY_ERROR); + if (lk_array[0] == NULL) + goto err; + + if (local->dst_cached) { + /* dst might be removed by the time inodelk reaches bricks, + * which can result in ESTALE errors. POSIX imposes no + * restriction for dst to be present for renames to be + * successful. So, we'll ignore ESTALE errors. As far as + * synchronization on dst goes, we'll achieve the same by + * holding entrylk on parent directory of dst in the namespace + * of basename(dst). Also, there might not be quorum in cluster + * xlators like EC/disperse on errno, in which case they return + * EIO. For eg., in a disperse (4 + 2), 3 might return success + * and three might return ESTALE. Disperse, having no Quorum + * unwinds inodelk with EIO. So, ignore EIO too. + */ + lk_array[1] = dht_lock_new(frame->this, local->dst_cached, &local->loc2, + F_WRLCK, DHT_FILE_MIGRATE_DOMAIN, NULL, + IGNORE_ENOENT_ESTALE_EIO); + if (lk_array[1] == NULL) + goto err; + } + + local->rename_inodelk_backward_compatible = lk_array; + local->rename_inodelk_bc_count = count; + + /* retaining inodelks for the sake of backward compatibility. Please + * make sure to remove this inodelk once all of 3.10, 3.12 and 3.13 + * reach EOL. 
Better way of getting synchronization would be to acquire + * entrylks on src and dst parent directories in the namespace of + * basenames of src and dst + */ + ret = dht_blocking_inodelk(frame, lk_array, count, + dht_rename_file_protect_namespace); + if (ret < 0) { + local->rename_inodelk_backward_compatible = NULL; + local->rename_inodelk_bc_count = 0; + goto err; + } + + return 0; +err: + if (lk_array != NULL) { + int tmp_count = 0, i = 0; + + for (i = 0; (i < count) && (lk_array[i]); i++, tmp_count++) + ; + + dht_lock_array_free(lk_array, tmp_count); + GF_FREE(lk_array); + } + + return -1; +} + +int +dht_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + xlator_t *src_cached = NULL; + xlator_t *src_hashed = NULL; + xlator_t *dst_cached = NULL; + xlator_t *dst_hashed = NULL; + int op_errno = -1; + int ret = -1; + dht_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + char newgfid[GF_UUID_BUF_SIZE] = {0}; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(oldloc, err); + VALIDATE_OR_GOTO(newloc, err); + + gf_uuid_unparse(oldloc->inode->gfid, gfid); + + src_hashed = dht_subvol_get_hashed(this, oldloc); + if (!src_hashed) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED, + "No hashed subvolume in layout for path=%s," + "(gfid = %s)", + oldloc->path, gfid); + op_errno = EINVAL; + goto err; + } + + src_cached = dht_subvol_get_cached(this, oldloc->inode); + if (!src_cached) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED, + "No cached subvolume for path = %s," + "(gfid = %s)", + oldloc->path, gfid); + + op_errno = EINVAL; + goto err; + } + + dst_hashed = dht_subvol_get_hashed(this, newloc); + if (!dst_hashed) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED, + "No hashed subvolume in layout for path=%s", newloc->path); + op_errno = EINVAL; + goto err; + } + + if (newloc->inode) + dst_cached = dht_subvol_get_cached(this, newloc->inode); + + 
local = dht_local_init(frame, oldloc, NULL, GF_FOP_RENAME); + if (!local) { + op_errno = ENOMEM; + goto err; + } + /* cached_subvol will be set from dht_local_init, reset it to NULL, + as the logic of handling rename is different */ + local->cached_subvol = NULL; + + ret = loc_copy(&local->loc2, newloc); + if (ret == -1) { + op_errno = ENOMEM; + goto err; + } + + local->src_hashed = src_hashed; + local->src_cached = src_cached; + local->dst_hashed = dst_hashed; + local->dst_cached = dst_cached; + if (xdata) + local->xattr_req = dict_ref(xdata); + + if (newloc->inode) + gf_uuid_unparse(newloc->inode->gfid, newgfid); + + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_INFO, + "renaming %s (%s) (hash=%s/cache=%s) => %s (%s) " + "(hash=%s/cache=%s) ", + oldloc->path, gfid, src_hashed->name, src_cached->name, newloc->path, + newloc->inode ? newgfid : NULL, dst_hashed->name, + dst_cached ? dst_cached->name : "<nul>"); + + if (IA_ISDIR(oldloc->inode->ia_type)) { + dht_rename_dir(frame, this); + } else { + local->op_ret = 0; + ret = dht_rename_lock(frame); + if (ret < 0) { + op_errno = ENOMEM; + goto err; } + } - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); - return 0; + return 0; +} + +int +dht_pt_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + gf_boolean_t free_xdata = _gf_false; + + /* Just a pass through */ + if (!IA_ISDIR(oldloc->inode->ia_type)) { + if (!xdata) { + free_xdata = _gf_true; + } + DHT_CHANGELOG_TRACK_AS_RENAME(xdata, oldloc, newloc); + } + default_rename(frame, this, oldloc, newloc, xdata); + if (free_xdata && xdata) { + dict_unref(xdata); + xdata = NULL; + } + return 0; } diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index ddd043dc8a4..3e24065227c 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -1,617 +1,2600 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. 
+#include "dht-lock.h" + +#define DHT_SET_LAYOUT_RANGE(layout, i, srt, chunk, path) \ + do { \ + layout->list[i].start = srt; \ + layout->list[i].stop = srt + chunk - 1; \ + layout->list[i].commit_hash = layout->commit_hash; \ + \ + gf_msg_trace(this->name, 0, \ + "gave fix: 0x%x - 0x%x, with commit-hash 0x%x" \ + " on %s for %s", \ + layout->list[i].start, layout->list[i].stop, \ + layout->list[i].commit_hash, \ + layout->list[i].xlator->name, path); \ + } while (0) + +#define DHT_RESET_LAYOUT_RANGE(layout) \ + do { \ + int cnt = 0; \ + for (cnt = 0; cnt < layout->cnt; cnt++) { \ + layout->list[cnt].start = 0; \ + layout->list[cnt].stop = 0; \ + } \ + } while (0) + +static int +dht_selfheal_layout_lock(call_frame_t *frame, dht_layout_t *layout, + gf_boolean_t newdir, dht_selfheal_layout_t healer, + dht_need_heal_t should_heal); + +static uint32_t +dht_overlap_calc(dht_layout_t *old, int o, dht_layout_t *new, int n) +{ + if (o >= old->cnt || n >= new->cnt) + return 0; - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ + if (old->list[o].err > 0 || new->list[n].err > 0) + return 0; -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif + if (old->list[o].start == old->list[o].stop) { + return 0; + } + if (new->list[n].start == new->list[n].stop) { + return 0; + } -#include "glusterfs.h" -#include "xlator.h" -#include "dht-common.h" + if ((old->list[o].start > new->list[n].stop) || + (old->list[o].stop < new->list[n].start)) + return 0; + return min(old->list[o].stop, new->list[n].stop) - + max(old->list[o].start, new->list[n].start) + 1; +} int -dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret) +dht_selfheal_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - - local = frame->local; - local->selfheal.dir_cbk (frame, NULL, frame->this, ret, - local->op_errno); + DHT_STACK_DESTROY(frame); + return 0; +} - return 0; +int +dht_selfheal_dir_finish(call_frame_t *frame, xlator_t *this, int ret, + int invoke_cbk) +{ + dht_local_t *local = NULL, *lock_local = NULL; + call_frame_t *lock_frame = NULL; + int lock_count = 0; + + local = frame->local; + + /* Unlock entrylk */ + dht_unlock_entrylk_wrapper(frame, &local->lock[0].ns.directory_ns); + + /* Unlock inodelk */ + lock_count = dht_lock_count(local->lock[0].ns.parent_layout.locks, + local->lock[0].ns.parent_layout.lk_count); + if (lock_count == 0) + goto done; + + lock_frame = copy_frame(frame); + if (lock_frame == NULL) { + goto done; + } + + lock_local = dht_local_init(lock_frame, &local->loc, NULL, + lock_frame->root->op); + if (lock_local == NULL) { + goto done; + } + + lock_local->lock[0].ns.parent_layout.locks = local->lock[0] + .ns.parent_layout.locks; + lock_local->lock[0] + .ns.parent_layout.lk_count = local->lock[0].ns.parent_layout.lk_count; + + local->lock[0].ns.parent_layout.locks = NULL; + local->lock[0].ns.parent_layout.lk_count = 0; + + dht_unlock_inodelk(lock_frame, 
lock_local->lock[0].ns.parent_layout.locks, + lock_local->lock[0].ns.parent_layout.lk_count, + dht_selfheal_unlock_cbk); + lock_frame = NULL; + +done: + if (invoke_cbk) + local->selfheal.dir_cbk(frame, NULL, frame->this, ret, local->op_errno, + NULL); + if (lock_frame != NULL) { + DHT_STACK_DESTROY(lock_frame); + } + + return 0; } +int +dht_refresh_layout_done(call_frame_t *frame) +{ + int ret = -1; + dht_layout_t *refreshed = NULL, *heal = NULL; + dht_local_t *local = NULL; + dht_need_heal_t should_heal = NULL; + dht_selfheal_layout_t healer = NULL; + + local = frame->local; + + refreshed = local->selfheal.refreshed_layout; + heal = local->selfheal.layout; + + healer = local->selfheal.healer; + should_heal = local->selfheal.should_heal; + + ret = dht_layout_sort(refreshed); + if (ret == -1) { + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + DHT_MSG_LAYOUT_SORT_FAILED, NULL); + goto err; + } + + if (should_heal(frame, &heal, &refreshed)) { + healer(frame, &local->loc, heal); + } else { + local->selfheal.layout = NULL; + local->selfheal.refreshed_layout = NULL; + local->selfheal.layout = refreshed; + + dht_layout_unref(frame->this, heal); + + dht_selfheal_dir_finish(frame, frame->this, 0, 1); + } + + return 0; + +err: + dht_selfheal_dir_finish(frame, frame->this, -1, 1); + return 0; +} int -dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) -{ - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - xlator_t *subvol = NULL; - int i = 0; - dht_layout_t *layout = NULL; - int err = 0; - int this_call_cnt = 0; - - local = frame->local; - layout = local->selfheal.layout; - prev = cookie; - subvol = prev->this; - - if (op_ret == 0) - err = 0; - else - err = op_errno; +dht_refresh_layout_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t 
*prev = NULL; + dht_layout_t *layout = NULL; + char gfid[GF_UUID_BUF_SIZE] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, err); + GF_VALIDATE_OR_GOTO("dht", frame->local, err); + GF_VALIDATE_OR_GOTO("dht", this->private, err); + + local = frame->local; + prev = cookie; + + layout = local->selfheal.refreshed_layout; + + LOCK(&frame->lock); + { + op_ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr); + + dht_iatt_merge(this, &local->stbuf, stbuf); + + if (op_ret == -1) { + gf_uuid_unparse(local->loc.gfid, gfid); + local->op_errno = op_errno; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_FILE_LOOKUP_FAILED, "path=%s", local->loc.path, + "name=%s", prev->name, "gfid=%s", gfid, NULL); + + goto unlock; + } - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].xlator == subvol) { - layout->list[i].err = err; - break; - } + local->op_ret = 0; + } +unlock: + UNLOCK(&frame->lock); + + this_call_cnt = dht_frame_return(frame); + + if (is_last_call(this_call_cnt)) { + if (local->op_ret == 0) { + local->refresh_layout_done(frame); + } else { + goto err; } + } + + return 0; - this_call_cnt = dht_frame_return (frame); +err: + if (local) { + local->refresh_layout_unlock(frame, this, -1, 1); + } + return 0; +} - if (is_last_call (this_call_cnt)) { - dht_selfheal_dir_finish (frame, this, 0); +int +dht_refresh_layout(call_frame_t *frame) +{ + int call_cnt = 0; + int i = 0, ret = -1; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + xlator_t *this = NULL; + char gfid[GF_UUID_BUF_SIZE] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + + this = frame->this; + conf = this->private; + local = frame->local; + + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + local->op_ret = -1; + + if (local->selfheal.refreshed_layout) { + dht_layout_unref(this, local->selfheal.refreshed_layout); + local->selfheal.refreshed_layout = NULL; + } + + 
local->selfheal.refreshed_layout = dht_layout_new(this, + conf->subvolume_cnt); + if (!local->selfheal.refreshed_layout) { + gf_uuid_unparse(local->loc.gfid, gfid); + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED, + "path=%s", local->loc.path, "gfid=%s", gfid, NULL); + goto out; + } + + if (local->xattr != NULL) { + dict_del(local->xattr, conf->xattr_name); + } + + if (local->xattr_req == NULL) { + gf_uuid_unparse(local->loc.gfid, gfid); + local->xattr_req = dict_new(); + if (local->xattr_req == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "path=%s", local->loc.path, "gfid=%s", gfid, NULL); + goto out; } + } - return 0; + if (dict_get(local->xattr_req, conf->xattr_name) == 0) { + ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", local->loc.path, "key=%s", conf->xattr_name, + NULL); + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_refresh_layout_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, + local->xattr_req); + } + + return 0; + +out: + if (local) { + local->refresh_layout_unlock(frame, this, -1, 1); + } + return 0; } +int32_t +dht_selfheal_layout_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + + local = frame->local; + + if (!local) { + goto err; + } + + if (op_ret < 0) { + local->op_errno = op_errno; + goto err; + } + + local->refresh_layout_unlock = dht_selfheal_dir_finish; + local->refresh_layout_done = dht_refresh_layout_done; + + dht_refresh_layout(frame); + return 0; + +err: + dht_selfheal_dir_finish(frame, this, -1, 1); + return 0; +} + +gf_boolean_t +dht_should_heal_layout(call_frame_t *frame, dht_layout_t **heal, + dht_layout_t **ondisk) +{ + gf_boolean_t fixit = _gf_true; + dht_local_t *local = NULL; + int heal_missing_dirs = 0; + 
+ local = frame->local; + + if ((heal == NULL) || (*heal == NULL) || (ondisk == NULL) || + (*ondisk == NULL)) + goto out; + + dht_layout_anomalies( + frame->this, &local->loc, *ondisk, &local->selfheal.hole_cnt, + &local->selfheal.overlaps_cnt, &local->selfheal.missing_cnt, + &local->selfheal.down, &local->selfheal.misc, NULL); + + /* Directories might've been created as part of this self-heal. We've to + * sync non-layout xattrs and set range 0-0 on new directories + */ + heal_missing_dirs = local->selfheal.force_mkdir + ? local->selfheal.force_mkdir + : dht_layout_missing_dirs(*heal); + + if ((local->selfheal.hole_cnt == 0) && + (local->selfheal.overlaps_cnt == 0) && heal_missing_dirs) { + dht_layout_t *tmp = NULL; + + /* Just added a brick and need to set 0-0 range on this brick. + * But ondisk layout is well-formed. So, swap layouts "heal" and + * "ondisk". Now "ondisk" layout will be used for healing + * xattrs. If there are any non-participating subvols in + * "ondisk" layout, dht_selfheal_dir_xattr_persubvol will set + * 0-0 and non-layout xattrs. This way we won't end up in + * "corrupting" already set and well-formed "ondisk" layout. + */ + tmp = *heal; + *heal = *ondisk; + *ondisk = tmp; + + /* Current selfheal code, heals non-layout xattrs only after + * an add-brick. In fact non-layout xattrs are considered as + * secondary citizens which are healed only if layout xattrs + * need to be healed. This is wrong, since for eg., quota can be + * set when layout is well-formed, but a node is down. Also, + * just for healing non-layout xattrs, we don't need locking. + * This issue is _NOT FIXED_ by this patch. 
+ */ + } + + fixit = (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt || + heal_missing_dirs); + +out: + return fixit; +} int -dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout, int i) +dht_layout_span(dht_layout_t *layout) { - xlator_t *subvol = NULL; - dict_t *xattr = NULL; - int ret = 0; - xlator_t *this = NULL; - int32_t *disk_layout = NULL; + int i = 0, count = 0; + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err) + continue; - subvol = layout->list[i].xlator; - this = frame->this; + if (layout->list[i].start != layout->list[i].stop) + count++; + } - xattr = get_new_dict (); - if (!xattr) { - goto err; + return count; +} + +int +dht_decommissioned_bricks_in_layout(xlator_t *this, dht_layout_t *layout) +{ + dht_conf_t *conf = NULL; + int count = 0, i = 0, j = 0; + + if ((this == NULL) || (layout == NULL)) + goto out; + + conf = this->private; + + for (i = 0; i < layout->cnt; i++) { + for (j = 0; j < conf->subvolume_cnt; j++) { + if (conf->decommissioned_bricks[j] && + conf->decommissioned_bricks[j] == layout->list[i].xlator) { + count++; + } } + } - ret = dht_disk_layout_extract (this, layout, i, &disk_layout); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "%s: (subvol %s) failed to extract disk layout", - loc->path, subvol->name); - goto err; +out: + return count; +} + +dht_distribution_type_t +dht_distribution_type(xlator_t *this, dht_layout_t *layout) +{ + dht_distribution_type_t type = GF_DHT_EQUAL_DISTRIBUTION; + int i = 0; + uint32_t start_range = 0, range = 0, diff = 0; + + if ((this == NULL) || (layout == NULL) || (layout->cnt < 1)) { + goto out; + } + + for (i = 0; i < layout->cnt; i++) { + if (start_range == 0) { + start_range = layout->list[i].stop - layout->list[i].start; + continue; } - ret = dict_set_bin (xattr, "trusted.glusterfs.dht", - disk_layout, 4 * 4); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "%s: (subvol %s) failed to set xattr dictionary", - 
loc->path, subvol->name); - goto err; + range = layout->list[i].stop - layout->list[i].start; + diff = (range >= start_range) ? range - start_range + : start_range - range; + + if ((range != 0) && (diff > layout->cnt)) { + type = GF_DHT_WEIGHTED_DISTRIBUTION; + break; } - disk_layout = NULL; + } - gf_log (this->name, GF_LOG_TRACE, - "setting hash range %u - %u (type %d) on subvolume %s for %s", - layout->list[i].start, layout->list[i].stop, - layout->type, subvol->name, loc->path); +out: + return type; +} - dict_ref (xattr); +gf_boolean_t +dht_should_fix_layout(call_frame_t *frame, dht_layout_t **inmem, + dht_layout_t **ondisk) +{ + gf_boolean_t fixit = _gf_true; - STACK_WIND (frame, dht_selfheal_dir_xattr_cbk, - subvol, subvol->fops->setxattr, - loc, xattr, 0); + dht_local_t *local = NULL; + int layout_span = 0; + int decommissioned_bricks = 0; + dht_conf_t *conf = NULL; + dht_distribution_type_t inmem_dist_type = 0; + dht_distribution_type_t ondisk_dist_type = 0; - dict_unref (xattr); + conf = frame->this->private; - return 0; + local = frame->local; -err: - if (xattr) - dict_destroy (xattr); + if ((inmem == NULL) || (*inmem == NULL) || (ondisk == NULL) || + (*ondisk == NULL)) + goto out; - if (disk_layout) - GF_FREE (disk_layout); + dht_layout_anomalies(frame->this, &local->loc, *ondisk, + &local->selfheal.hole_cnt, + &local->selfheal.overlaps_cnt, NULL, + &local->selfheal.down, &local->selfheal.misc, NULL); - dht_selfheal_dir_xattr_cbk (frame, subvol, frame->this, - -1, ENOMEM); - return 0; -} + if (local->selfheal.down || local->selfheal.misc) { + fixit = _gf_false; + goto out; + } + if (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt) + goto out; -int -dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) + /* If commit hashes are being updated, let it through */ + if ((*inmem)->commit_hash != (*ondisk)->commit_hash) + goto out; + + layout_span = dht_layout_span(*ondisk); + + decommissioned_bricks = 
dht_decommissioned_bricks_in_layout(frame->this, + *ondisk); + inmem_dist_type = dht_distribution_type(frame->this, *inmem); + ondisk_dist_type = dht_distribution_type(frame->this, *ondisk); + + if ((decommissioned_bricks == 0) && + (layout_span == + (conf->subvolume_cnt - conf->decommission_subvols_cnt)) && + (inmem_dist_type == ondisk_dist_type)) + fixit = _gf_false; + +out: + + return fixit; +} + +static int +dht_selfheal_layout_lock(call_frame_t *frame, dht_layout_t *layout, + gf_boolean_t newdir, dht_selfheal_layout_t healer, + dht_need_heal_t should_heal) { - dht_local_t *local = NULL; - int missing_xattr = 0; - int i = 0; - xlator_t *this = NULL; + dht_local_t *local = NULL; + int count = 1, ret = -1, i = 0; + dht_lock_t **lk_array = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *tmp = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; - local = frame->local; - this = frame->this; + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err); - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err != -1 || !layout->list[i].stop) { - /* err != -1 would mean xattr present on the directory - * or the directory is itself non existant. 
- * !layout->list[i].stop would mean layout absent - */ + local = frame->local; - continue; - } - missing_xattr++; + conf = frame->this->private; + + local->selfheal.healer = healer; + local->selfheal.should_heal = should_heal; + + tmp = local->selfheal.layout; + local->selfheal.layout = dht_layout_ref(frame->this, layout); + dht_layout_unref(frame->this, tmp); + + if (!newdir) { + count = conf->subvolume_cnt; + + lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_char); + if (lk_array == NULL) { + gf_uuid_unparse(local->stbuf.ia_gfid, gfid); + gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED, + "lk_array-gfid=%s", gfid, "path=%s", local->loc.path, NULL); + goto err; } - gf_log (this->name, GF_LOG_TRACE, - "%d subvolumes missing xattr for %s", - missing_xattr, loc->path); + for (i = 0; i < count; i++) { + lk_array[i] = dht_lock_new( + frame->this, conf->subvolumes[i], &local->loc, F_WRLCK, + DHT_LAYOUT_HEAL_DOMAIN, NULL, FAIL_ON_ANY_ERROR); + if (lk_array[i] == NULL) { + gf_uuid_unparse(local->stbuf.ia_gfid, gfid); + gf_smsg(THIS->name, GF_LOG_ERROR, ENOMEM, + DHT_MSG_MEM_ALLOC_FAILED, "lk_array-gfid=%s", gfid, + "path=%s", local->loc.path, NULL); + goto err; + } + } + } else { + count = 1; + lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_char); + if (lk_array == NULL) { + gf_uuid_unparse(local->stbuf.ia_gfid, gfid); + gf_smsg(THIS->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED, + "lk_array-gfid=%s", gfid, "path=%s", local->loc.path, NULL); + goto err; + } - if (missing_xattr == 0) { - dht_selfheal_dir_finish (frame, this, 0); - return 0; + lk_array[0] = dht_lock_new(frame->this, local->hashed_subvol, + &local->loc, F_WRLCK, DHT_LAYOUT_HEAL_DOMAIN, + NULL, FAIL_ON_ANY_ERROR); + if (lk_array[0] == NULL) { + gf_uuid_unparse(local->stbuf.ia_gfid, gfid); + gf_smsg(THIS->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED, + "lk_array-gfid=%s", gfid, "path=%s", local->loc.path, NULL); + goto err; } + } - local->call_cnt = 
missing_xattr; + local->lock[0].layout.my_layout.locks = lk_array; + local->lock[0].layout.my_layout.lk_count = count; - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err != -1 || !layout->list[i].stop) - continue; + ret = dht_blocking_inodelk(frame, lk_array, count, + dht_selfheal_layout_lock_cbk); + if (ret < 0) { + local->lock[0].layout.my_layout.locks = NULL; + local->lock[0].layout.my_layout.lk_count = 0; + goto err; + } - dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i); + return 0; +err: + if (lk_array != NULL) { + dht_lock_array_free(lk_array, count); + GF_FREE(lk_array); + } + + return -1; +} - if (--missing_xattr == 0) - break; +static int +dht_selfheal_dir_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + struct iatt *stbuf = NULL; + int i = 0; + int ret = 0; + dht_layout_t *layout = NULL; + int err = 0; + int this_call_cnt = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + layout = local->selfheal.layout; + subvol = cookie; + + if (op_ret == 0) { + err = 0; + } else { + gf_uuid_unparse(local->loc.gfid, gfid); + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "name=%s", subvol->name, + "path=%s", local->loc.path, "gfid=%s", gfid, NULL); + err = op_errno; + } + + ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf); + if (ret < 0) { + gf_uuid_unparse(local->loc.gfid, gfid); + gf_msg_debug(this->name, 0, + "key = %s not present in dict" + ", path:%s gfid:%s", + DHT_IATT_IN_XDATA_KEY, local->loc.path, gfid); + } + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) { + layout->list[i].err = err; + break; } - return 0; + } + + LOCK(&frame->lock); + { + dht_iatt_merge(this, &local->stbuf, stbuf); + } + UNLOCK(&frame->lock); + + this_call_cnt = dht_frame_return(frame); + + if (is_last_call(this_call_cnt)) { + 
dht_selfheal_dir_finish(frame, this, 0, 1); + } + + return 0; } +/* Code is required to set user xattr to local->xattr + */ int -dht_selfheal_dir_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *statpre, - struct iatt *statpost) +dht_set_user_xattr(dict_t *dict, char *k, data_t *v, void *data) +{ + dict_t *set_xattr = data; + int ret = -1; + + ret = dict_set(set_xattr, k, v); + return ret; +} + +static int +dht_selfheal_dir_xattr_persubvol(call_frame_t *frame, loc_t *loc, + dht_layout_t *layout, int i, + xlator_t *req_subvol) { - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int this_call_cnt = 0; + xlator_t *subvol = NULL; + dict_t *xattr = NULL; + dict_t *xdata = NULL; + int ret = 0; + xlator_t *this = NULL; + int32_t *disk_layout = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + data_t *data = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + if (req_subvol) + subvol = req_subvol; + else + subvol = layout->list[i].xlator; + this = frame->this; + + GF_VALIDATE_OR_GOTO("", this, err); + GF_VALIDATE_OR_GOTO(this->name, layout, err); + GF_VALIDATE_OR_GOTO(this->name, local, err); + GF_VALIDATE_OR_GOTO(this->name, subvol, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + xattr = dict_new(); + if (!xattr) { + goto err; + } + + xdata = dict_new(); + if (!xdata) + goto err; + + ret = dict_set_str(xdata, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", loc->path, "key=%s", GLUSTERFS_INTERNAL_FOP_KEY, + "gfid=%s", gfid, NULL); + goto err; + } + + ret = dict_set_int8(xdata, DHT_IATT_IN_XDATA_KEY, 1); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", loc->path, "key=%s", DHT_IATT_IN_XDATA_KEY, + "gfid=%s", gfid, NULL); + goto err; + } + + gf_uuid_unparse(loc->inode->gfid, gfid); + + ret = 
dht_disk_layout_extract(this, layout, i, &disk_layout); + if (ret == -1) { + gf_smsg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, + "extract-disk-layout-failed, path=%s", loc->path, "subvol=%s", + subvol->name, "gfid=%s", gfid, NULL); + goto err; + } + + ret = dict_set_bin(xattr, conf->xattr_name, disk_layout, 4 * 4); + if (ret == -1) { + gf_smsg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "path=%s", loc->path, + "subvol=%s", subvol->name, + "set-xattr-dictionary-failed" + "gfid=%s", + gfid, NULL); + goto err; + } + disk_layout = NULL; + + gf_msg_trace(this->name, 0, + "setting hash range 0x%x - 0x%x (type %d) on subvolume %s" + " for %s", + layout->list[i].start, layout->list[i].stop, layout->type, + subvol->name, loc->path); + + if (local->xattr) { + data = dict_get(local->xattr, QUOTA_LIMIT_KEY); + if (data) { + ret = dict_add(xattr, QUOTA_LIMIT_KEY, data); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", loc->path, "key=%s", QUOTA_LIMIT_KEY, NULL); + } + } + data = dict_get(local->xattr, QUOTA_LIMIT_OBJECTS_KEY); + if (data) { + ret = dict_add(xattr, QUOTA_LIMIT_OBJECTS_KEY, data); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", loc->path, "key=%s", QUOTA_LIMIT_OBJECTS_KEY, + NULL); + } + } + } + + if (!gf_uuid_is_null(local->gfid)) + gf_uuid_copy(loc->gfid, local->gfid); + + STACK_WIND_COOKIE(frame, dht_selfheal_dir_xattr_cbk, (void *)subvol, subvol, + subvol->fops->setxattr, loc, xattr, 0, xdata); + + dict_unref(xattr); + dict_unref(xdata); + + return 0; - local = frame->local; - layout = local->selfheal.layout; +err: + if (xattr) + dict_unref(xattr); + if (xdata) + dict_unref(xdata); + + GF_FREE(disk_layout); - this_call_cnt = dht_frame_return (frame); + dht_selfheal_dir_xattr_cbk(frame, (void *)subvol, frame->this, -1, ENOMEM, + NULL); + return 0; +} - if (is_last_call (this_call_cnt)) { - dht_selfheal_dir_xattr (frame, 
&local->loc, layout); +static int +dht_fix_dir_xattr(call_frame_t *frame, loc_t *loc, dht_layout_t *layout) +{ + dht_local_t *local = NULL; + int i = 0; + int count = 0; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *dummy = NULL; + + local = frame->local; + this = frame->this; + conf = this->private; + + gf_msg_debug(this->name, 0, "%s: Writing the new range for all subvolumes", + loc->path); + + local->call_cnt = count = conf->subvolume_cnt; + + if (gf_log_get_loglevel() >= GF_LOG_DEBUG) + dht_log_new_layout_for_dir_selfheal(this, loc, layout); + + for (i = 0; i < layout->cnt; i++) { + dht_selfheal_dir_xattr_persubvol(frame, loc, layout, i, NULL); + + if (--count == 0) + goto out; + } + /* if we are here, subvolcount > layout_count. subvols-per-directory + * option might be set here. We need to clear out layout from the + * non-participating subvolumes, else it will result in overlaps */ + dummy = dht_layout_new(this, 1); + if (!dummy) + goto out; + dummy->commit_hash = layout->commit_hash; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (_gf_false == dht_is_subvol_in_layout(layout, conf->subvolumes[i])) { + dht_selfheal_dir_xattr_persubvol(frame, loc, dummy, 0, + conf->subvolumes[i]); + if (--count == 0) + break; } + } - return 0; + dht_layout_unref(this, dummy); +out: + return 0; } +static int +dht_selfheal_dir_xattr(call_frame_t *frame, loc_t *loc, dht_layout_t *layout) +{ + dht_local_t *local = NULL; + int missing_xattr = 0; + int i = 0; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *dummy = NULL; + char gfid[GF_UUID_BUF_SIZE] = { + 0, + }; + + local = frame->local; + this = frame->this; + conf = this->private; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err != -1 || !layout->list[i].stop) { + /* err != -1 would mean xattr present on the directory + * or the directory is non existent. 
+ * !layout->list[i].stop would mean layout absent + */ + + continue; + } + missing_xattr++; + } + /* Also account for subvolumes with no-layout. Used for zero'ing out + * the layouts and for setting quota key's if present */ + for (i = 0; i < conf->subvolume_cnt; i++) { + if (_gf_false == dht_is_subvol_in_layout(layout, conf->subvolumes[i])) { + missing_xattr++; + } + } + gf_msg_trace(this->name, 0, "%d subvolumes missing xattr for %s", + missing_xattr, loc->path); + + if (missing_xattr == 0) { + dht_selfheal_dir_finish(frame, this, 0, 1); + return 0; + } + + local->call_cnt = missing_xattr; + + if (gf_log_get_loglevel() >= GF_LOG_DEBUG) + dht_log_new_layout_for_dir_selfheal(this, loc, layout); + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err != -1 || !layout->list[i].stop) + continue; + + dht_selfheal_dir_xattr_persubvol(frame, loc, layout, i, NULL); + + if (--missing_xattr == 0) + break; + } + dummy = dht_layout_new(this, 1); + if (!dummy) { + gf_uuid_unparse(loc->gfid, gfid); + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DUMMY_ALLOC_FAILED, + "path=%s", loc->path, "gfid=%s", gfid, NULL); + goto out; + } + for (i = 0; i < conf->subvolume_cnt && missing_xattr; i++) { + if (_gf_false == dht_is_subvol_in_layout(layout, conf->subvolumes[i])) { + dht_selfheal_dir_xattr_persubvol(frame, loc, dummy, 0, + conf->subvolumes[i]); + missing_xattr--; + } + } + + dht_layout_unref(this, dummy); +out: + return 0; +} int -dht_selfheal_dir_setattr (call_frame_t *frame, loc_t *loc, struct iatt *stbuf, - int32_t valid, dht_layout_t *layout) +dht_selfheal_dir_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) { - int missing_attr = 0; - int i = 0; - dht_local_t *local = NULL; - xlator_t *this = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int this_call_cnt = 0, ret = -1; - local = frame->local; - this = frame->this; + local = 
frame->local; + layout = local->selfheal.layout; - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err == -1) - missing_attr++; - } + this_call_cnt = dht_frame_return(frame); - if (missing_attr == 0) { - dht_selfheal_dir_xattr (frame, loc, layout); - return 0; + if (is_last_call(this_call_cnt)) { + if (!local->heal_layout) { + gf_msg_trace(this->name, 0, "Skip heal layout for %s gfid = %s ", + local->loc.path, uuid_utoa(local->gfid)); + + dht_selfheal_dir_finish(frame, this, 0, 1); + return 0; } + ret = dht_selfheal_layout_lock(frame, layout, _gf_false, + dht_selfheal_dir_xattr, + dht_should_heal_layout); - local->call_cnt = missing_attr; - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err == -1) { - gf_log (this->name, GF_LOG_TRACE, - "setattr for %s on subvol %s", - loc->path, layout->list[i].xlator->name); - - STACK_WIND (frame, dht_selfheal_dir_setattr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->setattr, - loc, stbuf, valid); - } + if (ret < 0) { + dht_selfheal_dir_finish(frame, this, -1, 1); } + } - return 0; + return 0; } int -dht_selfheal_dir_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) -{ - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - call_frame_t *prev = NULL; - xlator_t *subvol = NULL; - int i = 0; - int this_call_cnt = 0; - - - local = frame->local; - layout = local->selfheal.layout; - prev = cookie; - subvol = prev->this; - - if ((op_ret == 0) || ((op_ret == -1) && (op_errno == EEXIST))) { - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].xlator == subvol) { - layout->list[i].err = -1; - break; - } - } +dht_selfheal_dir_setattr(call_frame_t *frame, loc_t *loc, struct iatt *stbuf, + int32_t valid, dht_layout_t *layout) +{ + int missing_attr = 0; + int i = 0, ret = -1; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *this = NULL; + int cnt 
= 0; + + local = frame->local; + this = frame->this; + conf = this->private; + + /* We need to heal the attrs if: + * 1. Any directories were missing - the newly created dirs will need + * to have the correct attrs set + * 2. An existing dir does not have the correct permissions -they may + * have been changed when a brick was down. + */ + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == -1) + missing_attr++; + } + + if ((missing_attr == 0) && (local->need_attrheal == 0)) { + if (!local->heal_layout) { + gf_msg_trace(this->name, 0, "Skip heal layout for %s gfid = %s ", + loc->path, uuid_utoa(loc->gfid)); + dht_selfheal_dir_finish(frame, this, 0, 1); + return 0; } + ret = dht_selfheal_layout_lock(frame, layout, _gf_false, + dht_selfheal_dir_xattr, + dht_should_heal_layout); - if (op_ret) { - gf_log (this->name, ((op_errno == EEXIST) ? GF_LOG_DEBUG : - GF_LOG_WARNING), - "selfhealing directory %s failed: %s", - local->loc.path, strerror (op_errno)); - goto out; + if (ret < 0) { + dht_selfheal_dir_finish(frame, this, -1, 1); } - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - if (prev->this == local->hashed_subvol) - local->ia_ino = local->stbuf.ia_ino; + return 0; + } + + cnt = local->call_cnt = conf->subvolume_cnt; - dht_iatt_merge (this, &local->preparent, preparent, prev->this); - dht_iatt_merge (this, &local->postparent, postparent, prev->this); + for (i = 0; i < cnt; i++) { + STACK_WIND(frame, dht_selfheal_dir_setattr_cbk, layout->list[i].xlator, + layout->list[i].xlator->fops->setattr, loc, stbuf, valid, + NULL); + } -out: - this_call_cnt = dht_frame_return (frame); + return 0; +} - if (is_last_call (this_call_cnt)) { - dht_selfheal_dir_setattr (frame, &local->loc, &local->stbuf, 0xffffff, layout); +static int +dht_selfheal_dir_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + 
dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + xlator_t *prev = NULL; + xlator_t *subvol = NULL; + int i = 0, ret = -1; + int this_call_cnt = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + layout = local->selfheal.layout; + prev = cookie; + subvol = prev; + + if ((op_ret == 0) || ((op_ret == -1) && (op_errno == EEXIST))) { + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) { + layout->list[i].err = -1; + break; + } } + } + + if (op_ret) { + gf_uuid_unparse(local->loc.gfid, gfid); + gf_smsg(this->name, + ((op_errno == EEXIST) ? GF_LOG_DEBUG : GF_LOG_WARNING), + op_errno, DHT_MSG_DIR_SELFHEAL_FAILED, "path=%s", + local->loc.path, "gfid=%s", gfid, NULL); + goto out; + } + dht_iatt_merge(this, &local->preparent, preparent); + dht_iatt_merge(this, &local->postparent, postparent); + ret = 0; - return 0; -} +out: + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + dht_selfheal_dir_finish(frame, this, ret, 0); + dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf, 0xffffff, + layout); + } -int -dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout, int force) + return 0; +} + +static int +dht_selfheal_dir_mkdir_lookup_done(call_frame_t *frame, xlator_t *this) { - int missing_dirs = 0; - int i = 0; - int ret = -1; - dht_local_t *local = NULL; - xlator_t *this = NULL; - dict_t *dict = NULL; + dht_local_t *local = NULL; + int i = 0; + dict_t *dict = NULL; + dht_layout_t *layout = NULL; + loc_t *loc = NULL; + int cnt = 0; + int ret = -1; + + VALIDATE_OR_GOTO(this->private, err); + + local = frame->local; + layout = local->layout; + loc = &local->loc; + + if (!gf_uuid_is_null(local->gfid)) { + dict = dict_new(); + if (!dict) + return -1; + + ret = dict_set_gfuuid(dict, "gfid-req", local->gfid, true); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", loc->path, "key=gfid-req", NULL); + } else if (local->params) { + /* 
Send the dictionary from higher layers directly */ + + dict = dict_ref(local->params); + } + /* Code to update all extended attributed from local->xattr + to dict + */ + dht_dir_set_heal_xattr(this, local, dict, local->xattr, NULL, NULL); + + if (!dict) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_IS_NULL, NULL); + dict = dict_new(); + if (!dict) + return -1; + } + ret = dict_set_flag(dict, GF_INTERNAL_CTX_KEY, GF_DHT_HEAL_DIR); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, "key=%s", + GF_INTERNAL_CTX_KEY, "path=%s", loc->path, NULL); + /* We can still continue. As heal can still happen + * unless quota limits have reached for the dir. + */ + } + + cnt = layout->cnt; + for (i = 0; i < cnt; i++) { + if (layout->list[i].err == ESTALE || layout->list[i].err == ENOENT || + local->selfheal.force_mkdir) { + gf_msg_debug(this->name, 0, "Creating directory %s on subvol %s", + loc->path, layout->list[i].xlator->name); + + STACK_WIND_COOKIE( + frame, dht_selfheal_dir_mkdir_cbk, layout->list[i].xlator, + layout->list[i].xlator, layout->list[i].xlator->fops->mkdir, + loc, + st_mode_from_ia(local->stbuf.ia_prot, local->stbuf.ia_type), 0, + dict); + } + } - local = frame->local; - this = frame->this; + if (dict) + dict_unref(dict); - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err == ENOENT || force) - missing_dirs++; + return 0; + +err: + dht_selfheal_dir_finish(frame, this, -1, 1); + return 0; +} + +static int +dht_selfheal_dir_mkdir_lookup_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent) +{ + dht_local_t *local = NULL; + int i = 0; + int this_call_cnt = 0; + int missing_dirs = 0; + dht_layout_t *layout = NULL; + xlator_t *prev = 0; + loc_t *loc = NULL; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + int index = -1; + + VALIDATE_OR_GOTO(this->private, err); + + local = frame->local; + layout = local->layout; + 
loc = &local->loc; + prev = cookie; + + if (!gf_uuid_is_null(local->gfid)) + gf_uuid_unparse(local->gfid, gfid_local); + + LOCK(&frame->lock); + { + index = dht_layout_index_for_subvol(layout, prev); + if ((op_ret < 0) && (op_errno == ENOENT || op_errno == ESTALE)) { + local->selfheal.hole_cnt = !local->selfheal.hole_cnt + ? 1 + : local->selfheal.hole_cnt + 1; + /* the status might have changed. Update the layout with the + * new status + */ + if (index >= 0) { + layout->list[index].err = op_errno; + } } - if (missing_dirs == 0) { - dht_selfheal_dir_setattr (frame, loc, &local->stbuf, 0xffffffff, layout); + if (!op_ret) { + dht_iatt_merge(this, &local->stbuf, stbuf); + if (prev == local->mds_subvol) { + dict_unref(local->xattr); + local->xattr = dict_ref(xattr); + } + /* the status might have changed. Update the layout with the + * new status + */ + if (index >= 0) { + layout->list[index].err = -1; + } + } + } + UNLOCK(&frame->lock); + + this_call_cnt = dht_frame_return(frame); + + if (is_last_call(this_call_cnt)) { + if (local->selfheal.hole_cnt == layout->cnt) { + gf_msg_debug(this->name, op_errno, + "Lookup failed, an rmdir could have " + "deleted this entry %s", + loc->name); + local->op_errno = op_errno; + goto err; + } else { + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == ENOENT || + layout->list[i].err == ESTALE || + local->selfheal.force_mkdir) + missing_dirs++; + } + + if (missing_dirs == 0) { + dht_selfheal_dir_finish(frame, this, 0, 0); + dht_selfheal_dir_setattr(frame, loc, &local->stbuf, 0xffffffff, + layout); return 0; + } + + local->call_cnt = missing_dirs; + dht_selfheal_dir_mkdir_lookup_done(frame, this); } + } - local->call_cnt = missing_dirs; - if (!uuid_is_null (local->gfid)) { - dict = dict_new (); - if (!dict) - return -1; + return 0; - ret = dict_set_static_bin (dict, "gfid-req", local->gfid, 16); - if (ret) - gf_log (this->name, GF_LOG_INFO, - "%s: failed to set gfid in dict", loc->path); - } else if (local->params) { - 
/* Send the dictionary from higher layers directly */ - dict = dict_ref (local->params); +err: + dht_selfheal_dir_finish(frame, this, -1, 1); + return 0; +} + +static int +dht_selfheal_dir_mkdir_lock_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int i = 0; + int ret = -1; + xlator_t *mds_subvol = NULL; + + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + local = frame->local; + mds_subvol = local->mds_subvol; + + local->call_cnt = conf->subvolume_cnt; + + if (op_ret < 0) { + if (op_errno == EINVAL) { + local->call_cnt = 1; + dht_selfheal_dir_mkdir_lookup_done(frame, this); + return 0; } - if (!dict) - gf_log (this->name, GF_LOG_DEBUG, - "dict is NULL, need to make sure gfid's are same"); + gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_ENTRYLK_ERROR, + "path=%s", local->loc.path, NULL); + + local->op_errno = op_errno; + goto err; + } + + /* After getting locks, perform lookup again to ensure that the + directory was not deleted by a racing rmdir + */ + if (!local->xattr_req) + local->xattr_req = dict_new(); + + ret = dict_set_int32(local->xattr_req, "list-xattr", 1); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, "path=%s", + local->loc.path, NULL); + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (mds_subvol && conf->subvolumes[i] == mds_subvol) { + STACK_WIND_COOKIE(frame, dht_selfheal_dir_mkdir_lookup_cbk, + conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, + local->xattr_req); + } else { + STACK_WIND_COOKIE(frame, dht_selfheal_dir_mkdir_lookup_cbk, + conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, + NULL); + } + } - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err == ENOENT || force) { - gf_log (this->name, GF_LOG_DEBUG, - "creating directory %s on subvol %s", - loc->path, 
layout->list[i].xlator->name); - - STACK_WIND (frame, dht_selfheal_dir_mkdir_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->mkdir, - loc, - st_mode_from_ia (local->stbuf.ia_prot, - local->stbuf.ia_type), - dict); + return 0; + +err: + dht_selfheal_dir_finish(frame, this, -1, 1); + return 0; +} + +static int +dht_selfheal_dir_mkdir(call_frame_t *frame, loc_t *loc, dht_layout_t *layout, + int force) +{ + int missing_dirs = 0; + int i = 0; + int op_errno = 0; + int ret = -1; + dht_local_t *local = NULL; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + + local = frame->local; + this = frame->this; + conf = this->private; + + local->selfheal.force_mkdir = force; + local->selfheal.hole_cnt = 0; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == ENOENT || force) + missing_dirs++; + } + + if (missing_dirs == 0) { + /* We don't need to create any directories. Proceed to heal the + * attrs and xattrs + */ + if (!__is_root_gfid(local->stbuf.ia_gfid)) { + if (local->need_xattr_heal) { + local->need_xattr_heal = 0; + ret = dht_dir_xattr_heal(this, local, &op_errno); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s", + local->loc.path, "gfid=%s", local->gfid, NULL); } + } else { + if (!gf_uuid_is_null(local->gfid)) + gf_uuid_copy(loc->gfid, local->gfid); + + ret = dht_common_mark_mdsxattr(frame, NULL, 0); + if (!ret) + return 0; + + gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_SET_XATTR_FAILED, + "path=%s", local->loc.path, "gfid=%s", local->gfid, + NULL); + } + } + dht_selfheal_dir_setattr(frame, loc, &local->stbuf, 0xffffffff, layout); + return 0; + } + + /* MDS xattr is populated only while DHT is having more than one + subvol.In case of graph switch while adding more dht subvols need to + consider hash subvol as a MDS to avoid MDS check failure at the time + of running fop on directory + */ + if (!dict_get(local->xattr, conf->mds_xattr_key) && + (conf->subvolume_cnt > 1)) { + if 
(local->hashed_subvol == NULL) { + local->hashed_subvol = dht_subvol_get_hashed(this, loc); + if (local->hashed_subvol == NULL) { + local->op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, "gfid=%s", + loc->pargfid, "name=%s", loc->name, "path=%s", + loc->path, NULL); + goto err; + } } + ret = dht_inode_ctx_mdsvol_set(local->inode, this, + local->hashed_subvol); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "Failed to set hashed subvol for %s on inode vol is %s", + local->loc.path, + local->hashed_subvol ? local->hashed_subvol->name : "NULL"); + goto err; + } + } + + if (local->hashed_subvol == NULL) { + local->hashed_subvol = dht_subvol_get_hashed(this, loc); + if (local->hashed_subvol == NULL) { + local->op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, "gfid=%s", loc->pargfid, + "name=%s", loc->name, "path=%s", loc->path, NULL); + goto err; + } + } - if (dict) - dict_unref (dict); + local->current = &local->lock[0]; + ret = dht_protect_namespace(frame, loc, local->hashed_subvol, + &local->current->ns, + dht_selfheal_dir_mkdir_lock_cbk); - return 0; + if (ret < 0) + goto err; + + return 0; +err: + return -1; } +static int +dht_selfheal_layout_alloc_start(xlator_t *this, loc_t *loc, + dht_layout_t *layout) +{ + int start = 0; + uint32_t hashval = 0; + int ret = 0; + const char *str = NULL; + dht_conf_t *conf = NULL; + char buf[UUID_CANONICAL_FORM_LEN + 1] = { + 0, + }; + + conf = this->private; + + if (conf->randomize_by_gfid) { + str = uuid_utoa_r(loc->gfid, buf); + } else { + str = loc->path; + } + + ret = dht_hash_compute(this, layout->type, str, &hashval); + if (ret == 0) { + start = (hashval % layout->cnt); + } + + return start; +} -int -dht_selfheal_layout_alloc_start (xlator_t *this, loc_t *loc, - dht_layout_t *layout) +static int +dht_get_layout_count(xlator_t *this, dht_layout_t *layout, int 
new_layout) { - int start = 0; - uint32_t hashval = 0; - int ret = 0; + int i = 0; + int j = 0; + int err = 0; + int count = 0; + dht_conf_t *conf = NULL; + + /* Gets in use only for replace-brick, remove-brick */ + conf = this->private; + for (i = 0; i < layout->cnt; i++) { + for (j = 0; j < conf->subvolume_cnt; j++) { + if (conf->decommissioned_bricks[j] && + conf->decommissioned_bricks[j] == layout->list[i].xlator) { + layout->list[i].err = EINVAL; + break; + } + } + } + + for (i = 0; i < layout->cnt; i++) { + err = layout->list[i].err; + if (err == -1 || err == 0 || err == ENOENT) { + /* Take this with a pinch of salt. The behaviour seems + * to be slightly different when this function is + * invoked from mkdir codepath. For eg., err == 0 in + * mkdir codepath means directory created but xattr + * is not set yet. + */ + + /* Setting list[i].err = -1 is an indication for + dht_selfheal_layout_new_directory() to assign + a range. We set it to -1 based on any one of + the three criteria: + + - err == -1 already, which means directory + existed but layout was not set on it. + + - err == 0, which means directory exists and + has an old layout piece which will be + overwritten now. + + - err == ENOENT, which means directory does + not exist (possibly racing with mkdir or + finishing half done mkdir). The missing + directory will be attempted to be recreated. + */ + count++; + if (!err) + layout->list[i].err = -1; + } + } - ret = dht_hash_compute (layout->type, loc->path, &hashval); - if (ret == 0) { - start = (hashval % layout->cnt); + /* no subvolume has enough space, but can't stop directory creation */ + if (!count || !new_layout) { + for (i = 0; i < layout->cnt; i++) { + err = layout->list[i].err; + if (err == ENOSPC) { + layout->list[i].err = -1; + count++; + } } + } - return start; -} + /* if layout->spread_cnt is set, check if it is <= available + * subvolumes (down brick and decommissioned bricks are considered + * un-available). 
Else return count (available up bricks) */ + count = ((layout->spread_cnt && (layout->spread_cnt <= count)) + ? layout->spread_cnt + : ((count) ? count : 1)); + return count; +} void -dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout) -{ - xlator_t *this = NULL; - uint32_t chunk = 0; - int i = 0; - uint32_t start = 0; - int cnt = 0; - int err = 0; - int start_subvol = 0; +dht_selfheal_layout_new_directory(call_frame_t *frame, loc_t *loc, + dht_layout_t *new_layout); - this = frame->this; +void +dht_layout_range_swap(dht_layout_t *layout, int i, int j); - for (i = 0; i < layout->cnt; i++) { - err = layout->list[i].err; - if (err == -1 || err == 0) { - layout->list[i].err = -1; - cnt++; - } +/* + * It's a bit icky using local variables in a macro, but it makes the rest + * of the code a lot clearer. + */ +#define OV_ENTRY(x, y) table[x * new->cnt + y] + +static void +dht_selfheal_layout_maximize_overlap(call_frame_t *frame, loc_t *loc, + dht_layout_t *new, dht_layout_t *old) +{ + int i = 0; + int j = 0; + uint32_t curr_overlap = 0; + uint32_t max_overlap = 0; + int max_overlap_idx = -1; + uint32_t overlap = 0; + uint32_t *table = NULL; + + dht_layout_sort_volname(old); + /* Now both old_layout->list[] and new_layout->list[] + are match the same xlators/subvolumes. i.e, + old_layout->[i] and new_layout->[i] are referring + to the same subvolumes + */ + + /* Build a table of overlaps between new[i] and old[j]. */ + table = alloca(sizeof(overlap) * old->cnt * new->cnt); + if (!table) { + return; + } + memset(table, 0, sizeof(overlap) * old->cnt * new->cnt); + for (i = 0; i < new->cnt; ++i) { + for (j = 0; j < old->cnt; ++j) { + OV_ENTRY(i, j) = dht_overlap_calc(old, j, new, i); + } + } + + for (i = 0; i < new->cnt; i++) { + if (new->list[i].err > 0) { + /* Subvol might be marked for decommission + with EINVAL, or some other serious error + marked with positive errno. 
+ */ + continue; } - /* no subvolume has enough space, but can't stop directory creation */ - if (!cnt) { - for (i = 0; i < layout->cnt; i++) { - err = layout->list[i].err; - if (err == ENOSPC) { - layout->list[i].err = -1; - cnt++; - } + max_overlap = 0; + max_overlap_idx = i; + for (j = (i + 1); j < new->cnt; ++j) { + if (new->list[j].err > 0) { + /* Subvol might be marked for decommission + with EINVAL, or some other serious error + marked with positive errno. + */ + continue; + } + /* Calculate the overlap now. */ + curr_overlap = OV_ENTRY(i, i) + OV_ENTRY(j, j); + /* Calculate the overlap after the proposed swap. */ + overlap = OV_ENTRY(i, j) + OV_ENTRY(j, i); + /* Are we better than status quo? */ + if (overlap > curr_overlap) { + overlap -= curr_overlap; + /* Are we better than the previous choice? */ + if (overlap > max_overlap) { + max_overlap = overlap; + max_overlap_idx = j; } + } } - chunk = ((unsigned long) 0xffffffff) / ((cnt) ? cnt : 1); + if (max_overlap_idx != i) { + dht_layout_range_swap(new, i, max_overlap_idx); + /* Need to swap the table values too. 
*/ + for (j = 0; j < old->cnt; ++j) { + overlap = OV_ENTRY(i, j); + OV_ENTRY(i, j) = OV_ENTRY(max_overlap_idx, j); + OV_ENTRY(max_overlap_idx, j) = overlap; + } + } + } +} - start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout); +static dht_layout_t * +dht_fix_layout_of_directory(call_frame_t *frame, loc_t *loc, + dht_layout_t *layout) +{ + int i = 0; + xlator_t *this = NULL; + dht_layout_t *new_layout = NULL; + dht_conf_t *priv = NULL; + dht_local_t *local = NULL; + uint32_t subvol_down = 0; + gf_boolean_t maximize_overlap = _gf_true; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + this = frame->this; + priv = this->private; + local = frame->local; + + if (layout->type == DHT_HASH_TYPE_DM_USER) { + gf_msg_debug(THIS->name, 0, "leaving %s alone", loc->path); + goto done; + } + + new_layout = dht_layout_new(this, priv->subvolume_cnt); + if (!new_layout) { + gf_uuid_unparse(loc->gfid, gfid); + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED, + "new_layout, path=%s", loc->path, "gfid=%s", gfid, NULL); + goto done; + } + + /* If a subvolume is down, do not re-write the layout. 
*/ + dht_layout_anomalies(this, loc, layout, NULL, NULL, NULL, &subvol_down, + NULL, NULL); + + if (subvol_down) { + gf_uuid_unparse(loc->gfid, gfid); + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_FIX_FAILED, + "subvol-down=%u", subvol_down, "Skipping-fix-layout", "path=%s", + loc->path, "gfid=%s", gfid, NULL); + GF_FREE(new_layout); + return NULL; + } + + for (i = 0; i < new_layout->cnt; i++) { + if (layout->list[i].err != ENOSPC) + new_layout->list[i].err = layout->list[i].err; + else + new_layout->list[i].err = -1; + + new_layout->list[i].xlator = layout->list[i].xlator; + } + + new_layout->commit_hash = layout->commit_hash; + + if (priv->du_stats) { + for (i = 0; i < priv->subvolume_cnt; ++i) { + gf_smsg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_SUBVOL_INFO, + "index=%d", i, "name=%s", priv->subvolumes[i]->name, + "chunks=%u", priv->du_stats[i].chunks, "path=%s", loc->path, + NULL); + + /* Maximize overlap if the bricks are all the same + * size. + * This is probably not going to be very common on + * live setups but will benefit our regression tests + */ + if (i && (priv->du_stats[i].chunks != priv->du_stats[0].chunks)) { + maximize_overlap = _gf_false; + } + } + } else { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_DISK_USAGE_STATUS, + NULL); + } + + /* First give it a layout as though it is a new directory. 
This + ensures rotation to kick in */ + dht_layout_sort_volname(new_layout); + dht_selfheal_layout_new_directory(frame, loc, new_layout); + + /* Maximize overlap if weighted-rebalance is disabled */ + if (!priv->do_weighting) + maximize_overlap = _gf_true; + + /* Now selectively re-assign ranges only when it helps */ + if (maximize_overlap) { + dht_selfheal_layout_maximize_overlap(frame, loc, new_layout, layout); + } +done: + if (new_layout) { + /* Make sure the extra 'ref' for existing layout is removed */ + dht_layout_unref(this, local->layout); + + local->layout = new_layout; + } + + return local->layout; +} - for (i = start_subvol; i < layout->cnt; i++) { - err = layout->list[i].err; - if (err == -1) { - layout->list[i].start = start; - layout->list[i].stop = start + chunk - 1; +/* + * Having to call this 2x for each entry in the layout is pretty horrible, but + * that's what all of this layout-sorting nonsense gets us. + */ +static uint32_t +dht_get_chunks_from_xl(xlator_t *parent, xlator_t *child) +{ + dht_conf_t *priv = parent->private; + xlator_list_t *trav; + uint32_t index = 0; - start = start + chunk; + if (!priv->du_stats) { + return 0; + } - gf_log (this->name, GF_LOG_TRACE, - "gave fix: %u - %u on %s for %s", - layout->list[i].start, layout->list[i].stop, - layout->list[i].xlator->name, loc->path); - if (--cnt == 0) { - layout->list[i].stop = 0xffffffff; - break; - } - } + for (trav = parent->children; trav; trav = trav->next) { + if (trav->xlator == child) { + return priv->du_stats[index].chunks; } + ++index; + } - for (i = 0; i < start_subvol; i++) { - err = layout->list[i].err; - if (err == -1) { - layout->list[i].start = start; - layout->list[i].stop = start + chunk - 1; + return 0; +} - start = start + chunk; +void +dht_selfheal_layout_new_directory(call_frame_t *frame, loc_t *loc, + dht_layout_t *layout) +{ + xlator_t *this = NULL; + double chunk = 0; + int i = 0; + uint32_t start = 0; + int bricks_to_use = 0; + int err = 0; + int start_subvol = 
0; + uint32_t curr_size; + uint32_t range_size; + uint64_t total_size = 0; + int real_i; + dht_conf_t *priv; + gf_boolean_t weight_by_size; + int bricks_used = 0; + + this = frame->this; + priv = this->private; + weight_by_size = priv->do_weighting; + + bricks_to_use = dht_get_layout_count(this, layout, 1); + GF_ASSERT(bricks_to_use > 0); + + bricks_used = 0; + for (i = 0; i < layout->cnt; ++i) { + err = layout->list[i].err; + if ((err != -1) && (err != ENOENT)) { + continue; + } + curr_size = dht_get_chunks_from_xl(this, layout->list[i].xlator); + if (!curr_size) { + weight_by_size = _gf_false; + break; + } + total_size += curr_size; + if (++bricks_used >= bricks_to_use) { + break; + } + } + + if (weight_by_size && total_size) { + /* We know total_size is not zero. */ + chunk = ((double)0xffffffff) / ((double)total_size); + gf_msg_debug(this->name, 0, + "chunk size = 0xffffffff / %" PRIu64 " = %f", total_size, + chunk); + } else { + weight_by_size = _gf_false; + chunk = ((unsigned long)0xffffffff) / bricks_to_use; + } + + start_subvol = dht_selfheal_layout_alloc_start(this, loc, layout); + + /* clear out the range, as we are re-computing here */ + DHT_RESET_LAYOUT_RANGE(layout); + + /* + * OK, what's this "real_i" stuff about? This used to be two loops - + * from start_subvol to layout->cnt-1, then from 0 to start_subvol-1. + * That way is practically an open invitation to bugs when only one + * of the loops is updated. Using real_i and modulo operators to make + * it one loop avoids this problem. Remember, folks: it's everyone's + * responsibility to help stamp out copy/paste abuse. 
+ */ + bricks_used = 0; + for (real_i = 0; real_i < layout->cnt; real_i++) { + i = (real_i + start_subvol) % layout->cnt; + err = layout->list[i].err; + if ((err != -1) && (err != ENOENT)) { + continue; + } + if (weight_by_size) { + curr_size = dht_get_chunks_from_xl(this, layout->list[i].xlator); + if (!curr_size) { + continue; + } + } else { + curr_size = 1; + } + range_size = chunk * curr_size; + gf_msg_debug(this->name, 0, "assigning range size 0x%x to %s", + range_size, layout->list[i].xlator->name); + DHT_SET_LAYOUT_RANGE(layout, i, start, range_size, loc->path); + if (++bricks_used >= bricks_to_use) { + layout->list[i].stop = 0xffffffff; + goto done; + } + start += range_size; + } - gf_log (this->name, GF_LOG_TRACE, - "gave fix: %u - %u on %s for %s", - layout->list[i].start, layout->list[i].stop, - layout->list[i].xlator->name, loc->path); - if (--cnt == 0) { - layout->list[i].stop = 0xffffffff; - break; - } - } +done: + return; +} + +static int +dht_selfheal_dir_getafix(call_frame_t *frame, loc_t *loc, dht_layout_t *layout) +{ + dht_local_t *local = NULL; + uint32_t holes = 0; + int ret = -1; + int i = -1; + uint32_t overlaps = 0; + + local = frame->local; + + holes = local->selfheal.hole_cnt; + overlaps = local->selfheal.overlaps_cnt; + + if (holes || overlaps) { + /* If the layout has anomalies which would change the hash + * ranges, then we need to reset the commit_hash for this + * directory, as the layout would change and things may not + * be in place as expected */ + layout->commit_hash = DHT_LAYOUT_HASH_INVALID; + dht_selfheal_layout_new_directory(frame, loc, layout); + ret = 0; + } + + for (i = 0; i < layout->cnt; i++) { + /* directory not present */ + if (layout->list[i].err == ENOENT) { + ret = 0; + break; } + } + + /* TODO: give a fix to these non-virgins */ + + return ret; } +int +dht_selfheal_new_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + dht_layout_t *layout) +{ + dht_local_t *local = NULL; + int ret = 0; + inode_t 
*linked_inode = NULL, *inode = NULL; + loc_t *loc = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + char gfid[GF_UUID_BUF_SIZE] = {0}; + int32_t op_errno = EIO; + + local = frame->local; + + loc = &local->loc; + + gf_uuid_unparse(local->stbuf.ia_gfid, gfid); + gf_uuid_unparse(loc->parent->gfid, pgfid); + + linked_inode = inode_link(loc->inode, loc->parent, loc->name, + &local->stbuf); + if (!linked_inode) { + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, DHT_MSG_LINK_INODE_FAILED, + "pgfid=%s", pgfid, "name=%s", loc->name, "gfid=%s", gfid, NULL); + ret = -1; + goto out; + } + + inode = loc->inode; + loc->inode = linked_inode; + inode_unref(inode); + + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = dht_layout_ref(frame->this, layout); + + dht_layout_sort_volname(layout); + dht_selfheal_layout_new_directory(frame, &local->loc, layout); + + op_errno = ENOMEM; + ret = dht_selfheal_layout_lock(frame, layout, _gf_true, + dht_selfheal_dir_xattr, + dht_should_heal_layout); + +out: + if (ret < 0) { + dir_cbk(frame, NULL, frame->this, -1, op_errno, NULL); + } + + return 0; +} int -dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout) +dht_fix_directory_layout(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + dht_layout_t *layout) { - dht_conf_t *conf = NULL; - xlator_t *this = NULL; - dht_local_t *local = NULL; - int missing = -1; - int down = -1; - int holes = -1; - int ret = -1; - int i = -1; - int overlaps = -1; + dht_local_t *local = NULL; + dht_layout_t *tmp_layout = NULL; + int ret = 0; - this = frame->this; - conf = this->private; - local = frame->local; + local = frame->local; - missing = local->selfheal.missing; - down = local->selfheal.down; - holes = local->selfheal.hole_cnt; - overlaps = local->selfheal.overlaps_cnt; + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = dht_layout_ref(frame->this, layout); - if ((missing + down) == conf->subvolume_cnt) { - dht_selfheal_layout_new_directory (frame, loc, 
layout); - ret = 0; - } + /* No layout sorting required here */ + tmp_layout = dht_fix_layout_of_directory(frame, &local->loc, layout); + if (!tmp_layout) { + return -1; + } + + ret = dht_selfheal_layout_lock(frame, tmp_layout, _gf_false, + dht_fix_dir_xattr, dht_should_fix_layout); - if (holes <= down) { - /* the down subvol might fill up the holes */ - ret = 0; + return ret; +} + +int +dht_selfheal_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + loc_t *loc, dht_layout_t *layout) +{ + dht_local_t *local = NULL; + xlator_t *this = NULL; + uint32_t down = 0; + uint32_t misc = 0; + int ret = 0; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + char gfid[GF_UUID_BUF_SIZE] = {0}; + inode_t *linked_inode = NULL, *inode = NULL; + + local = frame->local; + this = frame->this; + + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = dht_layout_ref(this, layout); + + if (local->need_attrheal) { + if (__is_root_gfid(local->stbuf.ia_gfid)) { + local->stbuf.ia_gid = local->prebuf.ia_gid; + local->stbuf.ia_uid = local->prebuf.ia_uid; + + local->stbuf.ia_ctime = local->prebuf.ia_ctime; + local->stbuf.ia_ctime_nsec = local->prebuf.ia_ctime_nsec; + local->stbuf.ia_prot = local->prebuf.ia_prot; + + } else if (!IA_ISINVAL(local->mds_stbuf.ia_type)) { + local->stbuf = local->mds_stbuf; + } + } + + if (!__is_root_gfid(local->stbuf.ia_gfid)) { + gf_uuid_unparse(local->stbuf.ia_gfid, gfid); + gf_uuid_unparse(loc->parent->gfid, pgfid); + + linked_inode = inode_link(loc->inode, loc->parent, loc->name, + &local->stbuf); + if (!linked_inode) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LINK_INODE_FAILED, + "pgfid=%s", pgfid, "name=%s", loc->name, "gfid=%s", gfid, + NULL); + ret = 0; + goto sorry_no_fix; } - if (holes || overlaps) { - dht_selfheal_layout_new_directory (frame, loc, layout); - ret = 0; + inode = loc->inode; + loc->inode = linked_inode; + inode_unref(inode); + } + + if (local->need_xattr_heal && (local->mds_xattr)) { + dht_dir_set_heal_xattr(this, local, 
local->xattr, local->mds_xattr, + NULL, NULL); + dict_unref(local->mds_xattr); + local->mds_xattr = NULL; + } + + dht_layout_anomalies(this, loc, layout, &local->selfheal.hole_cnt, + &local->selfheal.overlaps_cnt, + &local->selfheal.missing_cnt, &local->selfheal.down, + &local->selfheal.misc, NULL); + + down = local->selfheal.down; + misc = local->selfheal.misc; + + if (down) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SELFHEAL_FAILED, + "path=%s", loc->path, "subvol-down=%d", down, "Not-fixing", + "gfid=%s", gfid, NULL); + ret = 0; + goto sorry_no_fix; + } + + if (misc) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SELFHEAL_FAILED, + "path=%s", loc->path, "misc=%d", misc, "unrecoverable-errors", + "gfid=%s", gfid, NULL); + + ret = 0; + goto sorry_no_fix; + } + + dht_layout_sort_volname(layout); + local->heal_layout = _gf_true; + + /* Ignore return value as it can be inferred from result of + * dht_layout_anomalies + */ + dht_selfheal_dir_getafix(frame, loc, layout); + + if (!(local->selfheal.hole_cnt || local->selfheal.overlaps_cnt || + local->selfheal.missing_cnt)) { + local->heal_layout = _gf_false; + } + + ret = dht_selfheal_dir_mkdir(frame, loc, layout, 0); + if (ret < 0) { + ret = 0; + goto sorry_no_fix; + } + + return 0; + +sorry_no_fix: + /* TODO: need to put appropriate local->op_errno */ + dht_selfheal_dir_finish(frame, this, ret, 1); + + return 0; +} + +int +dht_selfheal_restore(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + loc_t *loc, dht_layout_t *layout) +{ + int ret = 0; + dht_local_t *local = NULL; + + local = frame->local; + + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = dht_layout_ref(frame->this, layout); + + ret = dht_selfheal_dir_mkdir(frame, loc, layout, 1); + + return ret; +} + +int +dht_dir_heal_xattrs(void *data) +{ + call_frame_t *frame = NULL; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + xlator_t *mds_subvol = NULL; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dict_t *user_xattr 
= NULL; + dict_t *internal_xattr = NULL; + dict_t *mds_xattr = NULL; + dict_t *xdata = NULL; + int call_cnt = 0; + int ret = -1; + int uret = 0; + int uflag = 0; + int i = 0; + int xattr_hashed = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + int32_t allzero[1] = {0}; + + GF_VALIDATE_OR_GOTO("dht", data, out); + + frame = data; + local = frame->local; + this = frame->this; + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, local, out); + mds_subvol = local->mds_subvol; + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, out); + gf_uuid_unparse(local->loc.gfid, gfid); + + if (!mds_subvol) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_MDS_SUBVOL, "path=%s", + local->loc.path, "gfid=%s", gfid, NULL); + goto out; + } + + if ((local->loc.inode && gf_uuid_is_null(local->loc.inode->gfid)) || + gf_uuid_is_null(local->loc.gfid)) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_NOT_PRESENT, + "skip-heal path=%s", local->loc.path, "gfid=%s", gfid, NULL); + goto out; + } + + internal_xattr = dict_new(); + if (!internal_xattr) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CREATE_FAILED, + "dictionary", NULL); + goto out; + } + xdata = dict_new(); + if (!xdata) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CREATE_FAILED, + "dictionary", NULL); + goto out; + } + + call_cnt = conf->subvolume_cnt; + + user_xattr = dict_new(); + if (!user_xattr) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CREATE_FAILED, + "dictionary", NULL); + goto out; + } + + ret = syncop_listxattr(local->mds_subvol, &local->loc, &mds_xattr, NULL, + NULL); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LIST_XATTRS_FAILED, + "path=%s", local->loc.path, "name=%s", local->mds_subvol->name, + NULL); + } + + if (!mds_xattr) + goto out; + + dht_dir_set_heal_xattr(this, local, user_xattr, mds_xattr, &uret, &uflag); + + /* To set quota related xattr need to set GLUSTERFS_INTERNAL_FOP_KEY + * key value to 1 + */ + if (dict_get(user_xattr, 
QUOTA_LIMIT_KEY) || + dict_get(user_xattr, QUOTA_LIMIT_OBJECTS_KEY)) { + ret = dict_set_int32(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "key=%s", GLUSTERFS_INTERNAL_FOP_KEY, "path=%s", + local->loc.path, NULL); + goto out; } + } + if (uret <= 0 && !uflag) + goto out; + + for (i = 0; i < call_cnt; i++) { + subvol = conf->subvolumes[i]; + if (subvol == mds_subvol) + continue; + if (uret || uflag) { + /* Custom xattr heal is required - let posix handle it */ + ret = dict_set_int8(xdata, "sync_backend_xattrs", _gf_true); + if (ret) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", local->loc.path, "key=%s", + "sync_backend_xattrs", NULL); + goto out; + } + + ret = syncop_setxattr(subvol, &local->loc, user_xattr, 0, xdata, + NULL); + if (ret) { + xattr_hashed = 1; + gf_smsg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_DIR_XATTR_HEAL_FAILED, + "set-user-xattr-failed path=%s", local->loc.path, + "subvol=%s", subvol->name, "gfid=%s", gfid, NULL); + } else { + dict_del(xdata, "sync_backend_xattrs"); + } + } + } + /* After heal all custom xattr reset internal MDS xattr to 0 */ + if (!xattr_hashed) { + ret = dht_dict_set_array(internal_xattr, conf->mds_xattr_key, allzero, + 1); + if (ret) { + gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "key=%s", conf->mds_xattr_key, "path=%s", local->loc.path, + NULL); + goto out; + } + ret = syncop_setxattr(mds_subvol, &local->loc, internal_xattr, 0, NULL, + NULL); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s", local->loc.path, + "subvol=%s", mds_subvol->name, "gfid=%s", gfid, NULL); + } + } - for (i = 0; i < layout->cnt; i++) { - /* directory not present */ - if (layout->list[i].err == ENOENT) { - ret = 0; - break; +out: + if (user_xattr) + dict_unref(user_xattr); + if (mds_xattr) + dict_unref(mds_xattr); + if (internal_xattr) + dict_unref(internal_xattr); + if 
(xdata) + dict_unref(xdata); + return 0; +} + +int +dht_dir_heal_xattrs_done(int ret, call_frame_t *sync_frame, void *data) +{ + DHT_STACK_DESTROY(sync_frame); + return 0; +} + +int +dht_dir_attr_heal(void *data) +{ + call_frame_t *frame = NULL; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + xlator_t *mds_subvol = NULL; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + int call_cnt = 0; + int ret = -1; + int i = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + GF_VALIDATE_OR_GOTO("dht", data, out); + + frame = data; + local = frame->local; + this = frame->this; + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", local, out); + conf = this->private; + GF_VALIDATE_OR_GOTO("dht", conf, out); + + mds_subvol = local->mds_subvol; + call_cnt = conf->subvolume_cnt; + + if (!__is_root_gfid(local->stbuf.ia_gfid) && (!mds_subvol)) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_MDS_SUBVOL, "path=%s", + local->loc.path, "gfid=%s", gfid, NULL); + goto out; + } + + if (!__is_root_gfid(local->stbuf.ia_gfid)) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == mds_subvol) { + if (!conf->subvolume_status[i]) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_MDS_DOWN_UNABLE_TO_SET, "path=%s", + local->loc.path, "gfid=%s", gfid, NULL); + goto out; } + } + } + } + + for (i = 0; i < call_cnt; i++) { + subvol = conf->subvolumes[i]; + if (!subvol || subvol == mds_subvol) + continue; + if (__is_root_gfid(local->stbuf.ia_gfid)) { + ret = syncop_setattr( + subvol, &local->loc, &local->stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL, + NULL, NULL, NULL); + } else { + ret = syncop_setattr( + subvol, &local->loc, &local->mds_stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL, + NULL, NULL, NULL); } - /* TODO: give a fix to these non-virgins */ + if (ret) { + gf_uuid_unparse(local->loc.gfid, gfid); - return ret; + gf_smsg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_DIR_ATTR_HEAL_FAILED, "path=%s", 
local->loc.path, + "subvol=%s", subvol->name, "gfid=%s", gfid, NULL); + } + } +out: + return 0; } int -dht_selfheal_new_directory (call_frame_t *frame, - dht_selfheal_dir_cbk_t dir_cbk, - dht_layout_t *layout) +dht_dir_attr_heal_done(int ret, call_frame_t *sync_frame, void *data) { - dht_local_t *local = NULL; + DHT_STACK_DESTROY(sync_frame); + return 0; +} - local = frame->local; +/* EXIT: dht_update_commit_hash_for_layout */ +static int +dht_update_commit_hash_for_layout_done(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; - local->selfheal.dir_cbk = dir_cbk; - local->selfheal.layout = dht_layout_ref (frame->this, layout); + local = frame->local; - dht_layout_sort_volname (layout); - dht_selfheal_layout_new_directory (frame, &local->loc, layout); - dht_selfheal_dir_xattr (frame, &local->loc, layout); - return 0; + /* preserve oldest error */ + if (op_ret && !local->op_ret) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + + DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno, NULL); + + return 0; } +static int +dht_update_commit_hash_for_layout_unlock(call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + int ret = 0; + + local = frame->local; + + ret = dht_unlock_inodelk(frame, local->lock[0].layout.my_layout.locks, + local->lock[0].layout.my_layout.lk_count, + dht_update_commit_hash_for_layout_done); + if (ret < 0) { + /* preserve oldest error, just ... 
*/ + if (!local->op_ret) { + local->op_errno = errno; + local->op_ret = -1; + } -int -dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, - loc_t *loc, dht_layout_t *layout) + gf_smsg(this->name, GF_LOG_WARNING, errno, DHT_MSG_WIND_UNLOCK_FAILED, + "path=%s", local->loc.path, NULL); + + dht_update_commit_hash_for_layout_done(frame, NULL, this, 0, 0, NULL); + } + + return 0; +} + +static int +dht_update_commit_hash_for_layout_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xdata) { - dht_local_t *local = NULL; - uint32_t holes = 0; - uint32_t down = 0; - uint32_t misc = 0; - int ret = 0; - xlator_t *this = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; - local = frame->local; - this = frame->this; + local = frame->local; - dht_layout_anomalies (this, loc, layout, - &local->selfheal.hole_cnt, - &local->selfheal.overlaps_cnt, - &local->selfheal.missing, - &local->selfheal.down, - &local->selfheal.misc); + LOCK(&frame->lock); + /* store first failure, just because */ + if (op_ret && !local->op_ret) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + UNLOCK(&frame->lock); - holes = local->selfheal.hole_cnt; - down = local->selfheal.down; - misc = local->selfheal.misc; + this_call_cnt = dht_frame_return(frame); - local->selfheal.dir_cbk = dir_cbk; - local->selfheal.layout = dht_layout_ref (this, layout); + if (is_last_call(this_call_cnt)) { + dht_update_commit_hash_for_layout_unlock(frame, this); + } - if (down) { - gf_log (this->name, GF_LOG_INFO, - "%d subvolumes down -- not fixing", down); - ret = 0; - goto sorry_no_fix; - } + return 0; +} - if (misc) { - gf_log (this->name, GF_LOG_INFO, - "%d subvolumes have unrecoverable errors", misc); - ret = 0; - goto sorry_no_fix; +static int +dht_update_commit_hash_for_layout_resume(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int count = 1, 
ret = -1, i = 0, j = 0; + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + int32_t *disk_layout = NULL; + dict_t **xattr = NULL; + + local = frame->local; + conf = frame->this->private; + count = conf->local_subvols_cnt; + layout = local->layout; + + if (op_ret < 0) { + goto err_done; + } + + /* We precreate the xattr list as we cannot change call count post the + * first wind as we may never continue from there. So we finish prep + * work before winding the setxattrs */ + xattr = GF_CALLOC(count, sizeof(*xattr), gf_common_mt_char); + if (!xattr) { + local->op_errno = errno; + + gf_smsg(this->name, GF_LOG_WARNING, errno, DHT_MSG_COMMIT_HASH_FAILED, + "allocation-failed path=%s", local->loc.path, NULL); + + goto err; + } + + for (i = 0; i < count; i++) { + /* find the layout index for the subvolume */ + ret = dht_layout_index_for_subvol(layout, conf->local_subvols[i]); + if (ret < 0) { + local->op_errno = ENOENT; + + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_COMMIT_HASH_FAILED, + "path=%s", local->loc.path, "subvol=%s", + conf->local_subvols[i]->name, "find-disk-layout-failed", + NULL); + + goto err; } + j = ret; - dht_layout_sort_volname (layout); - ret = dht_selfheal_dir_getafix (frame, loc, layout); + /* update the commit hash for the layout */ + layout->list[j].commit_hash = layout->commit_hash; + /* extract the current layout */ + ret = dht_disk_layout_extract(this, layout, j, &disk_layout); if (ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "not able to form layout for the directory"); - goto sorry_no_fix; + local->op_errno = errno; + + gf_smsg(this->name, GF_LOG_WARNING, errno, + DHT_MSG_COMMIT_HASH_FAILED, "path=%s", local->loc.path, + "subvol=%s", conf->local_subvols[i]->name, + "extract-disk-layout-failed", NULL); + + goto err; } - dht_selfheal_dir_mkdir (frame, loc, layout, 0); + xattr[i] = dict_new(); + if (!xattr[i]) { + local->op_errno = errno; - return 0; + gf_smsg(this->name, GF_LOG_WARNING, errno, + DHT_MSG_COMMIT_HASH_FAILED, 
"path=%s Allocation-failed", + local->loc.path, NULL); -sorry_no_fix: - /* TODO: need to put appropriate local->op_errno */ - dht_selfheal_dir_finish (frame, this, ret); + goto err; + } - return 0; -} + ret = dict_set_bin(xattr[i], conf->xattr_name, disk_layout, 4 * 4); + if (ret != 0) { + local->op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "path=%s", + local->loc.path, "subvol=%s", conf->local_subvols[i]->name, + "set-xattr-failed", NULL); -int -dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, - loc_t *loc, dht_layout_t *layout) -{ - int ret = 0; - dht_local_t *local = NULL; + goto err; + } + disk_layout = NULL; + + gf_msg_trace(this->name, 0, + "setting commit hash %u on subvolume %s" + " for %s", + layout->list[j].commit_hash, conf->local_subvols[i]->name, + local->loc.path); + } + + /* wind the setting of the commit hash across the local subvols */ + local->call_cnt = count; + local->op_ret = 0; + local->op_errno = 0; + for (i = 0; i < count; i++) { + STACK_WIND(frame, dht_update_commit_hash_for_layout_cbk, + conf->local_subvols[i], + conf->local_subvols[i]->fops->setxattr, &local->loc, + xattr[i], 0, NULL); + } + for (i = 0; i < count; i++) + dict_unref(xattr[i]); + GF_FREE(xattr); + + return 0; +err: + if (xattr) { + for (i = 0; i < count; i++) { + if (xattr[i]) + dict_unref(xattr[i]); + } + + GF_FREE(xattr); + } + + GF_FREE(disk_layout); + + local->op_ret = -1; + + dht_update_commit_hash_for_layout_unlock(frame, this); - local = frame->local; + return 0; +err_done: + local->op_ret = -1; - local->selfheal.dir_cbk = dir_cbk; - local->selfheal.layout = dht_layout_ref (frame->this, layout); + dht_update_commit_hash_for_layout_done(frame, NULL, this, 0, 0, NULL); - ret = dht_selfheal_dir_mkdir (frame, loc, layout, 1); + return 0; +} + +/* ENTER: dht_update_commit_hash_for_layout (see EXIT above) + * This function is invoked from rebalance only. 
+ * As a result, the check here is simple enough to see if defrag is present + * in the conf, as other data would be populated appropriately if so. + * If ever this was to be used in other code paths, checks would need to + * change. + * + * Functional details: + * - Lock the inodes on the subvols that we want the commit hash updated + * - Update each layout with the inode layout, modified to take in the new + * commit hash. + * - Unlock and return. + */ +int +dht_update_commit_hash_for_layout(call_frame_t *frame) +{ + dht_local_t *local = NULL; + int count = 1, ret = -1, i = 0; + dht_lock_t **lk_array = NULL; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err); + + local = frame->local; + conf = frame->this->private; + + if (!conf->defrag) + goto err; + + count = conf->local_subvols_cnt; + lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_char); + if (lk_array == NULL) + goto err; + + for (i = 0; i < count; i++) { + lk_array[i] = dht_lock_new(frame->this, conf->local_subvols[i], + &local->loc, F_WRLCK, DHT_LAYOUT_HEAL_DOMAIN, + NULL, FAIL_ON_ANY_ERROR); + if (lk_array[i] == NULL) + goto err; + } + + local->lock[0].layout.my_layout.locks = lk_array; + local->lock[0].layout.my_layout.lk_count = count; + + ret = dht_blocking_inodelk(frame, lk_array, count, + dht_update_commit_hash_for_layout_resume); + if (ret < 0) { + local->lock[0].layout.my_layout.locks = NULL; + local->lock[0].layout.my_layout.lk_count = 0; + goto err; + } + + return 0; +err: + if (lk_array != NULL) { + dht_lock_array_free(lk_array, count); + GF_FREE(lk_array); + } - return ret; + return -1; } diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c new file mode 100644 index 00000000000..bb72b0ffbb5 --- /dev/null +++ b/xlators/cluster/dht/src/dht-shared.c @@ -0,0 +1,1104 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +/* TODO: add NS locking */ +#include <glusterfs/statedump.h> +#include "dht-common.h" +#include "dht-messages.h" + +#ifndef MAX +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) +#endif + +/* TODO: + - use volumename in xattr instead of "dht" + - use NS locks + - handle all cases in self heal layout reconstruction + - complete linkfile selfheal +*/ + +static void +dht_layout_dump(dht_layout_t *layout, const char *prefix) +{ + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + + if (!layout) + goto out; + + gf_proc_dump_build_key(key, prefix, "cnt"); + gf_proc_dump_write(key, "%d", layout->cnt); + gf_proc_dump_build_key(key, prefix, "preset"); + gf_proc_dump_write(key, "%d", layout->preset); + gf_proc_dump_build_key(key, prefix, "gen"); + gf_proc_dump_write(key, "%d", layout->gen); + if (layout->type != IA_INVAL) { + gf_proc_dump_build_key(key, prefix, "inode type"); + gf_proc_dump_write(key, "%d", layout->type); + } + + if (!IA_ISDIR(layout->type)) + goto out; + + for (i = 0; i < layout->cnt; i++) { + gf_proc_dump_build_key(key, prefix, "list[%d].err", i); + gf_proc_dump_write(key, "%d", layout->list[i].err); + gf_proc_dump_build_key(key, prefix, "list[%d].start", i); + gf_proc_dump_write(key, "0x%x", layout->list[i].start); + gf_proc_dump_build_key(key, prefix, "list[%d].stop", i); + gf_proc_dump_write(key, "0x%x", layout->list[i].stop); + if (layout->list[i].xlator) { + gf_proc_dump_build_key(key, prefix, "list[%d].xlator.type", i); + gf_proc_dump_write(key, "%s", layout->list[i].xlator->type); + gf_proc_dump_build_key(key, prefix, "list[%d].xlator.name", i); + gf_proc_dump_write(key, "%s", layout->list[i].xlator->name); + } + } + +out: + return; +} + +int32_t +dht_priv_dump(xlator_t *this) +{ + char 
key_prefix[GF_DUMP_MAX_BUF_LEN]; + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + dht_conf_t *conf = NULL; + int ret = -1; + + if (!this) + goto out; + + conf = this->private; + if (!conf) + goto out; + + ret = TRY_LOCK(&conf->subvolume_lock); + if (ret != 0) { + return ret; + } + + gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name); + gf_proc_dump_build_key(key_prefix, "xlator.cluster.dht", "%s.priv", + this->name); + gf_proc_dump_write("subvol_cnt", "%d", conf->subvolume_cnt); + for (i = 0; i < conf->subvolume_cnt; i++) { + snprintf(key, sizeof(key), "subvolumes[%d]", i); + gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type, + conf->subvolumes[i]->name); + if (conf->file_layouts && conf->file_layouts[i]) { + snprintf(key, sizeof(key), "file_layouts[%d]", i); + dht_layout_dump(conf->file_layouts[i], key); + } + if (conf->dir_layouts && conf->dir_layouts[i]) { + snprintf(key, sizeof(key), "dir_layouts[%d]", i); + dht_layout_dump(conf->dir_layouts[i], key); + } + if (conf->subvolume_status) { + snprintf(key, sizeof(key), "subvolume_status[%d]", i); + gf_proc_dump_write(key, "%d", (int)conf->subvolume_status[i]); + } + } + + gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed); + gf_proc_dump_write("gen", "%d", conf->gen); + gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk); + gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes); + gf_proc_dump_write("disk_unit", "%c", conf->disk_unit); + gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval); + gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit); + gf_proc_dump_write("use-readdirp", "%d", conf->use_readdirp); + + if (conf->du_stats && conf->subvolume_status) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->subvolume_status[i]) + continue; + + snprintf(key, sizeof(key), "subvolumes[%d]", i); + gf_proc_dump_write(key, "%s", conf->subvolumes[i]->name); + + snprintf(key, sizeof(key), 
"du_stats[%d].avail_percent", i); + gf_proc_dump_write(key, "%lf", conf->du_stats[i].avail_percent); + + snprintf(key, sizeof(key), "du_stats[%d].avail_space", i); + gf_proc_dump_write(key, "%" PRIu64, conf->du_stats[i].avail_space); + + snprintf(key, sizeof(key), "du_stats[%d].avail_inodes", i); + gf_proc_dump_write(key, "%lf", conf->du_stats[i].avail_inodes); + + snprintf(key, sizeof(key), "du_stats[%d].log", i); + gf_proc_dump_write(key, "%" PRIu32, conf->du_stats[i].log); + } + } + + if (conf->last_stat_fetch) + gf_proc_dump_write("last_stat_fetch", "%s", + ctime(&conf->last_stat_fetch)); + + UNLOCK(&conf->subvolume_lock); + +out: + return ret; +} + +int32_t +dht_inodectx_dump(xlator_t *this, inode_t *inode) +{ + int ret = -1; + dht_layout_t *layout = NULL; + + if (!this) + goto out; + if (!inode) + goto out; + + ret = dht_inode_ctx_layout_get(inode, this, &layout); + + if ((ret != 0) || !layout) + return ret; + + gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name); + dht_layout_dump(layout, "layout"); + +out: + return ret; +} + +void +dht_fini(xlator_t *this) +{ + int i = 0; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO("dht", this, out); + + conf = this->private; + this->private = NULL; + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + GF_FREE(conf->file_layouts[i]); + } + GF_FREE(conf->file_layouts); + } + + dict_unref(conf->leaf_to_subvol); + + /* allocated in dht_init_subvolumes() */ + GF_FREE(conf->subvolumes); + GF_FREE(conf->subvolume_status); + GF_FREE(conf->last_event); + GF_FREE(conf->subvol_up_time); + GF_FREE(conf->du_stats); + GF_FREE(conf->decommissioned_bricks); + + /* allocated in dht_init() */ + GF_FREE(conf->mds_xattr_key); + GF_FREE(conf->link_xattr_name); + GF_FREE(conf->commithash_xattr_name); + GF_FREE(conf->wild_xattr_name); + + /* allocated in dht_init_regex() */ + if (conf->rsync_regex_valid) + regfree(&conf->rsync_regex); + if (conf->extra_regex_valid) + 
regfree(&conf->extra_regex); + + synclock_destroy(&conf->link_lock); + + if (conf->lock_pool) + mem_pool_destroy(conf->lock_pool); + + GF_FREE(conf); + } +out: + return; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO("dht", this, out); + + ret = xlator_mem_acct_init(this, gf_dht_mt_end + 1); + + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY, + "Memory accounting init failed"); + return ret; + } +out: + return ret; +} + +static int +dht_parse_decommissioned_bricks(xlator_t *this, dht_conf_t *conf, + const char *bricks) +{ + int i = 0; + int ret = -1; + char *tmpstr = NULL; + char *dup_brick = NULL; + char *node = NULL; + + if (!conf || !bricks) + goto out; + + dup_brick = gf_strdup(bricks); + if (dup_brick == NULL) { + goto out; + } + + node = strtok_r(dup_brick, ",", &tmpstr); + while (node) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!strcmp(conf->subvolumes[i]->name, node)) { + conf->decommissioned_bricks[i] = conf->subvolumes[i]; + conf->decommission_subvols_cnt++; + gf_msg(this->name, GF_LOG_INFO, 0, + DHT_MSG_SUBVOL_DECOMMISSION_INFO, + "decommissioning subvolume %s", + conf->subvolumes[i]->name); + break; + } + } + if (i == conf->subvolume_cnt) { + /* Wrong node given. 
*/ + goto out; + } + node = strtok_r(NULL, ",", &tmpstr); + } + + ret = 0; + conf->decommission_in_progress = 1; +out: + GF_FREE(dup_brick); + + return ret; +} + +static void +dht_decommissioned_remove(xlator_t *this, dht_conf_t *conf) +{ + int i = 0; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i]) { + conf->decommissioned_bricks[i] = NULL; + conf->decommission_subvols_cnt--; + } + } +} + +static void +dht_init_regex(xlator_t *this, dict_t *odict, char *name, regex_t *re, + gf_boolean_t *re_valid, dht_conf_t *conf) +{ + char *temp_str = NULL; + + if (dict_get_str(odict, name, &temp_str) != 0) { + if (strcmp(name, "rsync-hash-regex")) { + return; + } + temp_str = "^\\.(.+)\\.[^.]+$"; + } + + LOCK(&conf->lock); + { + if (*re_valid) { + regfree(re); + *re_valid = _gf_false; + } + + if (!strcmp(temp_str, "none")) { + goto unlock; + } + + if (regcomp(re, temp_str, REG_EXTENDED) == 0) { + gf_msg_debug(this->name, 0, "using regex %s = %s", name, temp_str); + *re_valid = _gf_true; + } else { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_REGEX_INFO, + "compiling regex %s failed", temp_str); + } + } +unlock: + UNLOCK(&conf->lock); +} + +int +dht_set_subvol_range(xlator_t *this) +{ + int ret = -1; + dht_conf_t *conf = NULL; + + conf = this->private; + + if (!conf) + goto out; + + conf->leaf_to_subvol = dict_new(); + if (!conf->leaf_to_subvol) + goto out; + + ret = glusterfs_reachable_leaves(this, conf->leaf_to_subvol); + +out: + return ret; +} + +static int +dht_configure_throttle(xlator_t *this, dht_conf_t *conf, char *temp_str) +{ + int rebal_thread_count = 0; + int ret = 0; + + pthread_mutex_lock(&conf->defrag->dfq_mutex); + { + if (!strcasecmp(temp_str, "lazy")) { + conf->defrag->recon_thread_count = 1; + } else if (!strcasecmp(temp_str, "normal")) { + conf->defrag->recon_thread_count = 2; + } else if (!strcasecmp(temp_str, "aggressive")) { + conf->defrag->recon_thread_count = MAX(MAX_REBAL_THREADS - 4, 4); + } else if 
((gf_string2int(temp_str, &rebal_thread_count) == 0)) { + if ((rebal_thread_count > 0) && + (rebal_thread_count <= MAX_REBAL_THREADS)) { + conf->defrag->recon_thread_count = rebal_thread_count; + pthread_mutex_unlock(&conf->defrag->dfq_mutex); + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "rebal thread count configured to %d", + rebal_thread_count); + goto out; + } else { + pthread_mutex_unlock(&conf->defrag->dfq_mutex); + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option: Reconfigure: " + "rebal-throttle should be " + "within range of 0 and maximum number of" + " cores available"); + ret = -1; + goto out; + } + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option: Reconfigure: " + "rebal-throttle should be {lazy|normal|aggressive}" + " or a number up to the number of cores available," + " not (%s), defaulting to (%d)", + temp_str, conf->dthrottle); + ret = -1; + } + } + pthread_mutex_unlock(&conf->defrag->dfq_mutex); + +out: + return ret; +} + +int +dht_reconfigure(xlator_t *this, dict_t *options) +{ + dht_conf_t *conf = NULL; + char *temp_str = NULL; + gf_boolean_t search_unhashed; + int ret = -1; + + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", options, out); + + conf = this->private; + if (!conf) + return 0; + + if (dict_get_str(options, "lookup-unhashed", &temp_str) == 0) { + /* If option is not "auto", other options _should_ be boolean*/ + if (strcasecmp(temp_str, "auto")) { + if (!gf_string2boolean(temp_str, &search_unhashed)) { + gf_msg_debug(this->name, 0, + "Reconfigure: " + "lookup-unhashed reconfigured(%s)", + temp_str); + conf->search_unhashed = search_unhashed; + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option: Reconfigure: " + "lookup-unhashed should be boolean," + " not (%s), defaulting to (%d)", + temp_str, conf->search_unhashed); + ret = -1; + goto out; + } + } else { + gf_msg_debug(this->name, 0, + "Reconfigure:" + " 
lookup-unhashed reconfigured auto "); + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; + } + } + + GF_OPTION_RECONF("lookup-optimize", conf->lookup_optimize, options, bool, + out); + + GF_OPTION_RECONF("min-free-disk", conf->min_free_disk, options, + percent_or_size, out); + /* option can be any one of percent or bytes */ + conf->disk_unit = 0; + if (conf->min_free_disk < 100.0) + conf->disk_unit = 'p'; + + GF_OPTION_RECONF("min-free-inodes", conf->min_free_inodes, options, percent, + out); + + GF_OPTION_RECONF("directory-layout-spread", conf->dir_spread_cnt, options, + uint32, out); + + GF_OPTION_RECONF("readdir-optimize", conf->readdir_optimize, options, bool, + out); + GF_OPTION_RECONF("randomize-hash-range-by-gfid", conf->randomize_by_gfid, + options, bool, out); + + GF_OPTION_RECONF("lock-migration", conf->lock_migration_enabled, options, + bool, out); + + GF_OPTION_RECONF("force-migration", conf->force_migration, options, bool, + out); + + if (conf->defrag) { + if (dict_get_str(options, "rebal-throttle", &temp_str) == 0) { + ret = dht_configure_throttle(this, conf, temp_str); + if (ret == -1) + goto out; + } + } + + if (conf->defrag) { + conf->defrag->lock_migration_enabled = conf->lock_migration_enabled; + } + + if (conf->defrag) { + GF_OPTION_RECONF("rebalance-stats", conf->defrag->stats, options, bool, + out); + } + + if (dict_get_str(options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks(this, conf, temp_str); + if (ret == -1) + goto out; + } else { + dht_decommissioned_remove(this, conf); + } + + dht_init_regex(this, options, "rsync-hash-regex", &conf->rsync_regex, + &conf->rsync_regex_valid, conf); + dht_init_regex(this, options, "extra-hash-regex", &conf->extra_regex, + &conf->extra_regex_valid, conf); + + GF_OPTION_RECONF("weighted-rebalance", conf->do_weighting, options, bool, + out); + + GF_OPTION_RECONF("use-readdirp", conf->use_readdirp, options, bool, out); + ret = 0; +out: + return ret; +} + +static 
int +gf_defrag_pattern_list_fill(xlator_t *this, gf_defrag_info_t *defrag, + char *data) +{ + int ret = -1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *num = NULL; + char *pattern_str = NULL; + char *pattern = NULL; + gf_defrag_pattern_list_t *temp_list = NULL; + gf_defrag_pattern_list_t *pattern_list = NULL; + + if (!this || !defrag || !data) + goto out; + + /* Get the pattern for pattern list. "pattern:<optional-size>" + * eg: *avi, *pdf:10MB, *:1TB + */ + pattern_str = strtok_r(data, ",", &tmp_str); + while (pattern_str) { + dup_str = gf_strdup(pattern_str); + if (!dup_str) + goto out; + pattern_list = GF_CALLOC(1, sizeof(gf_defrag_pattern_list_t), 1); + if (!pattern_list) { + goto out; + } + pattern = strtok_r(dup_str, ":", &tmp_str1); + num = strtok_r(NULL, ":", &tmp_str1); + if (!pattern) + goto out; + if (!num) { + if (gf_string2bytesize_uint64(pattern, &pattern_list->size) == 0) { + pattern = "*"; + } + } else if (gf_string2bytesize_uint64(num, &pattern_list->size) != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option. 
Defrag pattern:" + " Invalid number format \"%s\"", + num); + goto out; + } + memcpy(pattern_list->path_pattern, pattern, strlen(dup_str)); + + if (!defrag->defrag_pattern) + temp_list = NULL; + else + temp_list = defrag->defrag_pattern; + + pattern_list->next = temp_list; + + defrag->defrag_pattern = pattern_list; + pattern_list = NULL; + + GF_FREE(dup_str); + dup_str = NULL; + + pattern_str = strtok_r(NULL, ",", &tmp_str); + } + + ret = 0; +out: + if (ret) + GF_FREE(pattern_list); + GF_FREE(dup_str); + + return ret; +} + +static int +dht_init_methods(xlator_t *this) +{ + int ret = -1; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; + + GF_VALIDATE_OR_GOTO("dht", this, err); + + conf = this->private; + methods = &(conf->methods); + + methods->migration_get_dst_subvol = dht_migration_get_dst_subvol; + methods->migration_other = NULL; + methods->layout_search = dht_layout_search; + + ret = 0; +err: + return ret; +} + +int +dht_init(xlator_t *this) +{ + dht_conf_t *conf = NULL; + char *temp_str = NULL; + int ret = -1; + int i = 0; + gf_defrag_info_t *defrag = NULL; + int cmd = 0; + char *node_uuid = NULL; + uint32_t commit_hash = 0; + + GF_VALIDATE_OR_GOTO("dht", this, err); + + if (!this->children) { + gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_INVALID_CONFIGURATION, + "Distribute needs more than one subvolume"); + return -1; + } + + if (!this->parents) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_CONFIGURATION, + "dangling volume. 
check volfile"); + } + + conf = GF_CALLOC(1, sizeof(*conf), gf_dht_mt_dht_conf_t); + if (!conf) { + goto err; + } + + LOCK_INIT(&conf->subvolume_lock); + LOCK_INIT(&conf->layout_lock); + LOCK_INIT(&conf->lock); + synclock_init(&conf->link_lock, SYNC_LOCK_DEFAULT); + + /* We get the commit-hash to set only for rebalance process */ + if (dict_get_uint32(this->options, "commit-hash", &commit_hash) == 0) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_COMMIT_HASH_INFO, + "%s using commit hash %u", __func__, commit_hash); + conf->vol_commit_hash = commit_hash; + conf->vch_forced = _gf_true; + } + + ret = dict_get_int32(this->options, "rebalance-cmd", &cmd); + + if (cmd) { + defrag = GF_CALLOC(1, sizeof(gf_defrag_info_t), gf_defrag_info_mt); + + GF_VALIDATE_OR_GOTO(this->name, defrag, err); + + LOCK_INIT(&defrag->lock); + + defrag->is_exiting = 0; + + conf->defrag = defrag; + defrag->this = this; + + ret = dict_get_str(this->options, "node-uuid", &node_uuid); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_CONFIGURATION, + "Invalid volume configuration: " + "node-uuid not specified"); + goto err; + } + + if (gf_uuid_parse(node_uuid, defrag->node_uuid)) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option:" + " Cannot parse glusterd node uuid"); + goto err; + } + + defrag->cmd = cmd; + + defrag->stats = _gf_false; + + defrag->queue = NULL; + + defrag->crawl_done = 0; + + defrag->global_error = 0; + + defrag->q_entry_count = 0; + + defrag->wakeup_crawler = 0; + + pthread_mutex_init(&defrag->dfq_mutex, 0); + pthread_cond_init(&defrag->parallel_migration_cond, 0); + pthread_cond_init(&defrag->rebalance_crawler_alarm, 0); + pthread_cond_init(&defrag->df_wakeup_thread, 0); + + pthread_mutex_init(&defrag->fc_mutex, 0); + pthread_cond_init(&defrag->fc_wakeup_cond, 0); + + defrag->global_error = 0; + } + + conf->use_fallocate = 1; + + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; + if (dict_get_str(this->options, 
"lookup-unhashed", &temp_str) == 0) { + /* If option is not "auto", other options _should_ be boolean */ + if (strcasecmp(temp_str, "auto")) { + gf_boolean_t search_unhashed_bool; + ret = gf_string2boolean(temp_str, &search_unhashed_bool); + if (ret == -1) { + goto err; + } + conf->search_unhashed = search_unhashed_bool + ? GF_DHT_LOOKUP_UNHASHED_ON + : GF_DHT_LOOKUP_UNHASHED_OFF; + } else { + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; + } + } + + GF_OPTION_INIT("lookup-optimize", conf->lookup_optimize, bool, err); + + GF_OPTION_INIT("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool, err); + + GF_OPTION_INIT("use-readdirp", conf->use_readdirp, bool, err); + + GF_OPTION_INIT("min-free-disk", conf->min_free_disk, percent_or_size, err); + + GF_OPTION_INIT("min-free-inodes", conf->min_free_inodes, percent, err); + + conf->dir_spread_cnt = conf->subvolume_cnt; + GF_OPTION_INIT("directory-layout-spread", conf->dir_spread_cnt, uint32, + err); + + GF_OPTION_INIT("assert-no-child-down", conf->assert_no_child_down, bool, + err); + + GF_OPTION_INIT("readdir-optimize", conf->readdir_optimize, bool, err); + + GF_OPTION_INIT("lock-migration", conf->lock_migration_enabled, bool, err); + + GF_OPTION_INIT("force-migration", conf->force_migration, bool, err); + + if (defrag) { + defrag->lock_migration_enabled = conf->lock_migration_enabled; + + GF_OPTION_INIT("rebalance-stats", defrag->stats, bool, err); + if (dict_get_str(this->options, "rebalance-filter", &temp_str) == 0) { + if (gf_defrag_pattern_list_fill(this, defrag, temp_str) == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option:" + " Cannot parse rebalance-filter (%s)", + temp_str); + + goto err; + } + } + } + + /* option can be any one of percent or bytes */ + conf->disk_unit = 0; + if (conf->min_free_disk < 100) + conf->disk_unit = 'p'; + + ret = dht_init_subvolumes(this, conf); + if (ret == -1) { + goto err; + } + + if (cmd) { + ret = dht_init_local_subvolumes(this, conf); 
+ if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_INIT_LOCAL_SUBVOL_FAILED, + "dht_init_local_subvolumes failed"); + goto err; + } + } + + if (dict_get_str(this->options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks(this, conf, temp_str); + if (ret == -1) + goto err; + } + + dht_init_regex(this, this->options, "rsync-hash-regex", &conf->rsync_regex, + &conf->rsync_regex_valid, conf); + dht_init_regex(this, this->options, "extra-hash-regex", &conf->extra_regex, + &conf->extra_regex_valid, conf); + + ret = dht_layouts_init(this, conf); + if (ret == -1) { + goto err; + } + + conf->gen = 1; + + this->local_pool = mem_pool_new(dht_local_t, 512); + if (!this->local_pool) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + " DHT initialisation failed. " + "failed to create local_t's memory pool"); + goto err; + } + + GF_OPTION_INIT("randomize-hash-range-by-gfid", conf->randomize_by_gfid, + bool, err); + + if (defrag) { + GF_OPTION_INIT("rebal-throttle", temp_str, str, err); + if (temp_str) { + ret = dht_configure_throttle(this, conf, temp_str); + if (ret == -1) + goto err; + } + } + + GF_OPTION_INIT("xattr-name", conf->xattr_name, str, err); + gf_asprintf(&conf->mds_xattr_key, "%s." DHT_MDS_STR, conf->xattr_name); + gf_asprintf(&conf->link_xattr_name, "%s." DHT_LINKFILE_STR, + conf->xattr_name); + gf_asprintf(&conf->commithash_xattr_name, "%s." 
DHT_COMMITHASH_STR, + conf->xattr_name); + gf_asprintf(&conf->wild_xattr_name, "%s*", conf->xattr_name); + if (!conf->link_xattr_name || !conf->wild_xattr_name) { + goto err; + } + + GF_OPTION_INIT("weighted-rebalance", conf->do_weighting, bool, err); + + conf->lock_pool = mem_pool_new(dht_lock_t, 512); + if (!conf->lock_pool) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INIT_FAILED, + "failed to create lock mem_pool, failing " + "initialization"); + goto err; + } + + this->private = conf; + + if (dht_set_subvol_range(this)) + goto err; + + if (dht_init_methods(this)) + goto err; + + return 0; + +err: + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + GF_FREE(conf->file_layouts[i]); + } + GF_FREE(conf->file_layouts); + } + + GF_FREE(conf->subvolumes); + + GF_FREE(conf->subvolume_status); + + GF_FREE(conf->du_stats); + + GF_FREE(conf->defrag); + + GF_FREE(conf->xattr_name); + GF_FREE(conf->link_xattr_name); + GF_FREE(conf->wild_xattr_name); + GF_FREE(conf->mds_xattr_key); + + if (conf->lock_pool) + mem_pool_destroy(conf->lock_pool); + + GF_FREE(conf); + } + + return -1; +} + +struct volume_options dht_options[] = { + { + .key = {"lookup-unhashed"}, + .value = {"auto", "yes", "no", "enable", "disable", "1", "0", "on", + "off"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "on", + .description = + "This option if set to ON, does a lookup through " + "all the sub-volumes, in case a lookup didn't return any result " + "from the hash subvolume. If set to OFF, it does not do a lookup " + "on the remaining subvolumes.", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE, + .level = OPT_STATUS_BASIC, + }, + {.key = {"lookup-optimize"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = + "This option if set to ON enables the optimization " + "of -ve lookups, by not doing a lookup on non-hashed subvolumes for " + "files, in case the hashed subvolume does not return any result. 
" + "This option disregards the lookup-unhashed setting, when enabled.", + .op_version = {GD_OP_VERSION_3_7_2}, + .level = OPT_STATUS_ADVANCED, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"min-free-disk"}, + .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, + .default_value = "10%", + .description = + "Percentage/Size of disk space, after which the " + "process starts balancing out the cluster, and logs will appear " + "in log files", + .op_version = {1}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"min-free-inodes"}, + .type = GF_OPTION_TYPE_PERCENT, + .default_value = "5%", + .description = "after system has only N% of inodes, warnings " + "starts to appear in log files", + .op_version = {1}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + { + .key = {"unhashed-sticky-bit"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, + {.key = {"use-readdirp"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "This option if set to ON, forces the use of " + "readdirp, and hence also displays the stats of the files.", + .level = OPT_STATUS_ADVANCED, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"assert-no-child-down"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON, in the event of " + "CHILD_DOWN, will call exit."}, + { + .key = {"directory-layout-spread"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .validate = GF_OPT_VALIDATE_MIN, + .description = "Specifies the directory layout spread. 
Takes number " + "of subvolumes as default value.", + + .op_version = {2}, + }, + { + .key = {"decommissioned-bricks"}, + .type = GF_OPTION_TYPE_ANY, + .description = + "This option if set to ON, decommissions " + "the brick, so that no new data is allowed to be created " + "on that brick.", + .level = OPT_STATUS_ADVANCED, + }, + { + .key = {"rebalance-cmd"}, + .type = GF_OPTION_TYPE_INT, + }, + { + .key = {"commit-hash"}, + .type = GF_OPTION_TYPE_INT, + }, + { + .key = {"node-uuid"}, + .type = GF_OPTION_TYPE_STR, + }, + { + .key = {"rebalance-stats"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = + "This option if set to ON displays and logs the " + " time taken for migration of each file, during the rebalance " + "process. If set to OFF, the rebalance logs will only display the " + "time spent in each directory.", + .op_version = {2}, + .level = OPT_STATUS_BASIC, + }, + {.key = {"readdir-optimize"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = + "This option if set to ON enables the optimization " + "that allows DHT to requests non-first subvolumes to filter out " + "directory entries.", + .op_version = {1}, + .level = OPT_STATUS_ADVANCED, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"rsync-hash-regex"}, + .type = GF_OPTION_TYPE_STR, + /* Setting a default here doesn't work. See dht_init_regex. */ + .description = + "Regular expression for stripping temporary-file " + "suffix and prefix used by rsync, to prevent relocation when the " + "file is renamed.", + .op_version = {3}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"extra-hash-regex"}, + .type = GF_OPTION_TYPE_STR, + /* Setting a default here doesn't work. See dht_init_regex. 
*/ + .description = + "Regular expression for stripping temporary-file " + "suffix and prefix used by an application, to prevent relocation when " + "the file is renamed.", + .op_version = {3}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + { + .key = {"rebalance-filter"}, + .type = GF_OPTION_TYPE_STR, + }, + + { + .key = {"xattr-name"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "trusted.glusterfs.dht", + .description = + "Base for extended attributes used by this " + "translator instance, to avoid conflicts with others above or " + "below it.", + .op_version = {3}, + }, + + {.key = {"weighted-rebalance"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = + "When enabled, files will be allocated to bricks " + "with a probability proportional to their size. Otherwise, all " + "bricks will have the same probability (legacy behavior).", + .op_version = {GD_OP_VERSION_3_6_0}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + + /* NUFA option */ + {.key = {"local-volume-name"}, .type = GF_OPTION_TYPE_XLATOR}, + + /* switch option */ + {.key = {"pattern.switch.case"}, .type = GF_OPTION_TYPE_ANY}, + + { + .key = {"randomize-hash-range-by-gfid"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = + "Use gfid of directory to determine the subvolume " + "from which hash ranges are allocated starting with 0. " + "Note that we still use a directory/file's name to determine the " + "subvolume to which it hashes", + .op_version = {GD_OP_VERSION_3_6_0}, + }, + + {.key = {"rebal-throttle"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "normal", + .description = " Sets the maximum number of parallel file migrations " + "allowed on a node during the rebalance operation. The" + " default value is normal and allows a max of " + "[($(processing units) - 4) / 2), 2] files to be " + "migrated at a time. 
Lazy will allow only one file to " + "be migrated at a time and aggressive will allow " + "max of [($(processing units) - 4) / 2), 4]", + .op_version = {GD_OP_VERSION_3_7_0}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC + + }, + + {.key = {"lock-migration"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = " If enabled this feature will migrate the posix locks" + " associated with a file during rebalance", + .op_version = {GD_OP_VERSION_3_8_0}, + .level = OPT_STATUS_ADVANCED, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + + {.key = {"force-migration"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "If disabled, rebalance will not migrate files that " + "are being written to by an application", + .op_version = {GD_OP_VERSION_4_0_0}, + .level = OPT_STATUS_ADVANCED, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + + {.key = {NULL}}, +}; + +#define NUM_DHT_OPTIONS (sizeof(dht_options) / sizeof(dht_options[0])) + +extern struct volume_options options[NUM_DHT_OPTIONS] + __attribute__((alias("dht_options"))); diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c index c9b77d64428..53de8292704 100644 --- a/xlators/cluster/dht/src/dht.c +++ b/xlators/cluster/dht/src/dht.c @@ -1,547 +1,123 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -/* TODO: add NS locking */ - -#include "statedump.h" -#include "dht-common.c" - -/* TODO: - - use volumename in xattr instead of "dht" - - use NS locks - - handle all cases in self heal layout reconstruction - - complete linkfile selfheal + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - -void -dht_layout_dump (dht_layout_t *layout, const char *prefix) -{ - - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 0; - - GF_VALIDATE_OR_GOTO ("dht", layout, out); - GF_VALIDATE_OR_GOTO ("dht", prefix, out); - - gf_proc_dump_build_key(key, prefix, "cnt"); - gf_proc_dump_write(key, "%d", layout->cnt); - gf_proc_dump_build_key(key, prefix, "preset"); - gf_proc_dump_write(key, "%d", layout->preset); - gf_proc_dump_build_key(key, prefix, "gen"); - gf_proc_dump_write(key, "%d", layout->gen); - gf_proc_dump_build_key(key, prefix, "type"); - gf_proc_dump_write(key, "%d", layout->type); - - for (i = 0; i < layout->cnt; i++) { - gf_proc_dump_build_key(key, prefix,"list[%d].err", i); - gf_proc_dump_write(key, "%d", layout->list[i].err); - gf_proc_dump_build_key(key, prefix,"list[%d].start", i); - gf_proc_dump_write(key, "%u", layout->list[i].start); - gf_proc_dump_build_key(key, prefix,"list[%d].stop", i); - gf_proc_dump_write(key, "%u", layout->list[i].stop); - if (layout->list[i].xlator) { - gf_proc_dump_build_key(key, prefix, - "list[%d].xlator.type", i); - gf_proc_dump_write(key, "%s", - layout->list[i].xlator->type); - gf_proc_dump_build_key(key, prefix, - "list[%d].xlator.name", i); - 
gf_proc_dump_write(key, "%s", - layout->list[i].xlator->name); - } - } - -out: - return; -} - - -int32_t -dht_priv_dump (xlator_t *this) -{ - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 0; - dht_conf_t *conf = NULL; - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - conf = this->private; - - if (!conf) - return -1; - - ret = TRY_LOCK(&conf->subvolume_lock); - - if (ret != 0) { - gf_log("", GF_LOG_WARNING, "Unable to lock dht subvolume %s", - this->name); - return ret; - } - - gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name); - gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv", - this->name); - gf_proc_dump_build_key(key, key_prefix, "subvolume_cnt"); - gf_proc_dump_write(key,"%d", conf->subvolume_cnt); - for (i = 0; i < conf->subvolume_cnt; i++) { - gf_proc_dump_build_key(key, key_prefix, "subvolumes[%d]", i); - gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type, - conf->subvolumes[i]->name); - if (conf->file_layouts && conf->file_layouts[i]){ - gf_proc_dump_build_key(key, key_prefix, - "file_layouts[%d]",i); - dht_layout_dump(conf->file_layouts[i], key); - } - if (conf->dir_layouts && conf->dir_layouts[i]) { - gf_proc_dump_build_key(key, key_prefix, - "dir_layouts[%d]",i); - dht_layout_dump(conf->dir_layouts[i], key); - } - if (conf->subvolume_status) { - gf_proc_dump_build_key(key, key_prefix, - "subvolume_status[%d]", i); - gf_proc_dump_write(key, "%d", - (int)conf->subvolume_status[i]); - } - - } - - gf_proc_dump_build_key(key, key_prefix,"default_dir_layout"); - dht_layout_dump(conf->default_dir_layout, key); - - gf_proc_dump_build_key(key, key_prefix, "search_unhashed"); - gf_proc_dump_write(key, "%d", conf->search_unhashed); - gf_proc_dump_build_key(key, key_prefix, "gen"); - gf_proc_dump_write(key, "%d", conf->gen); - gf_proc_dump_build_key(key, key_prefix, "min_free_disk"); - gf_proc_dump_write(key, "%lu", conf->min_free_disk); - gf_proc_dump_build_key(key, 
key_prefix, "disk_unit"); - gf_proc_dump_write(key, "%c", conf->disk_unit); - gf_proc_dump_build_key(key, key_prefix, "refresh_interval"); - gf_proc_dump_write(key, "%d", conf->refresh_interval); - gf_proc_dump_build_key(key, key_prefix, "unhashed_sticky_bit"); - gf_proc_dump_write(key, "%d", conf->unhashed_sticky_bit); - if (conf ->du_stats) { - gf_proc_dump_build_key(key, key_prefix, - "du_stats.avail_percent"); - gf_proc_dump_write(key, "%lf", conf->du_stats->avail_percent); - gf_proc_dump_build_key(key, key_prefix, - "du_stats.avail_space"); - gf_proc_dump_write(key, "%lu", conf->du_stats->avail_space); - gf_proc_dump_build_key(key, key_prefix, - "du_stats.log"); - gf_proc_dump_write(key, "%lu", conf->du_stats->log); - } - gf_proc_dump_build_key(key, key_prefix, "last_stat_fetch"); - gf_proc_dump_write(key, "%s", ctime(&conf->last_stat_fetch.tv_sec)); - - UNLOCK(&conf->subvolume_lock); - -out: - return ret; -} - -int32_t -dht_inodectx_dump (xlator_t *this, inode_t *inode) -{ - int ret = -1; - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - dht_layout_t *layout = NULL; - uint64_t tmp_layout = 0; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", inode, out); - - ret = inode_ctx_get (inode, this, &tmp_layout); - - if (ret != 0) - return ret; - - layout = (dht_layout_t *)(long)tmp_layout; - - if (!layout) - return -1; - - gf_proc_dump_build_key(key_prefix, "xlator.cluster.dht", - "%s.inode.%ld", this->name, inode->ino); - dht_layout_dump(layout, key_prefix); - -out: - return ret; -} - -int -notify (xlator_t *this, int event, void *data, ...) 
-{ - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - ret = dht_notify (this, event, data); - -out: - return ret; -} - -void -fini (xlator_t *this) -{ - int i = 0; - dht_conf_t *conf = NULL; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - conf = this->private; - this->private = NULL; - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->default_dir_layout) - GF_FREE (conf->default_dir_layout); - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - GF_FREE (conf); - } -out: - return; -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } -out: - return ret; -} - -int -validate_options (xlator_t *this, char **op_errstr) -{ - int ret = 0; - volume_opt_list_t *vol_opt = NULL; - volume_opt_list_t *tmp; - - if (!this) { - gf_log (this->name, GF_LOG_DEBUG, "'this' not a valid ptr"); - ret =-1; - goto out; - } - - if (list_empty (&this->volume_options)) - goto out; - - vol_opt = list_entry (this->volume_options.next, - volume_opt_list_t, list); - list_for_each_entry_safe (vol_opt, tmp, &this->volume_options, list) { - ret = validate_xlator_volume_options_attacherr (this, - vol_opt->given_opt, - op_errstr); - } - -out: - - return ret; -} - -int -reconfigure (xlator_t *this, dict_t *options) -{ - dht_conf_t *conf = NULL; - char *temp_str = NULL; - gf_boolean_t search_unhashed; - uint32_t temp_free_disk = 0; - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", options, out); - - conf = this->private; - if (!conf) - return 0; - - if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not 
"auto", other options _should_ be boolean*/ - if (strcasecmp (temp_str, "auto")) { - if (!gf_string2boolean (temp_str, &search_unhashed)) { - gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" - " lookup-unahashed reconfigured (%s)", - temp_str); - conf->search_unhashed = search_unhashed; - } else { - gf_log(this->name, GF_LOG_ERROR, "Reconfigure:" - " lookup-unahashed should be boolean," - " not (%s), defaulting to (%d)", - temp_str, conf->search_unhashed); - //return -1; - ret = -1; - goto out; - } - } else { - gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" - " lookup-unahashed reconfigured auto "); - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - } - - if (dict_get_str (options, "min-free-disk", &temp_str) == 0) { - if (gf_string2percent (temp_str, &temp_free_disk) == 0) { - if (temp_free_disk > 100) { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } else { - conf->min_free_disk = (uint64_t)temp_free_disk; - } - } else { - gf_string2bytesize (temp_str, &conf->min_free_disk); - conf->disk_unit = 'b'; - } - - gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" - " min-free-disk reconfigured to %s", - temp_str); - } - ret = 0; -out: - return ret; -} - -int -init (xlator_t *this) -{ - dht_conf_t *conf = NULL; - char *temp_str = NULL; - int ret = -1; - int i = 0; - uint32_t temp_free_disk = 0; - - GF_VALIDATE_OR_GOTO ("dht", this, err); - - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "Distribute needs more than one subvolume"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. 
check volfile"); - } - - conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t); - if (!conf) { - goto err; - } - - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean */ - if (strcasecmp (temp_str, "auto")) - gf_string2boolean (temp_str, &conf->search_unhashed); - else - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - - conf->unhashed_sticky_bit = 0; - - if (dict_get_str (this->options, "unhashed-sticky-bit", - &temp_str) == 0) { - gf_string2boolean (temp_str, &conf->unhashed_sticky_bit); - } - - conf->use_readdirp = 1; - - if (dict_get_str (this->options, "use-readdirp", - &temp_str) == 0) { - gf_string2boolean (temp_str, &conf->use_readdirp); - } - - conf->disk_unit = 'p'; - conf->min_free_disk = 10; - - if (dict_get_str (this->options, "min-free-disk", &temp_str) == 0) { - if (gf_string2percent (temp_str, &temp_free_disk) == 0) { - if (temp_free_disk > 100) { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } else { - conf->min_free_disk = (uint64_t)temp_free_disk; - } - } else { - gf_string2bytesize (temp_str, &conf->min_free_disk); - conf->disk_unit = 'b'; - } - } - - - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; - } - - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; - } - - conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), - gf_dht_mt_dht_du_t); - if (!conf->du_stats) { - goto err; - } - - LOCK_INIT (&conf->subvolume_lock); - LOCK_INIT (&conf->layout_lock); - - conf->gen = 1; - - this->private = conf; - - return 0; - -err: - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->default_dir_layout) - GF_FREE (conf->default_dir_layout); - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - 
- if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - if (conf->du_stats) - GF_FREE (conf->du_stats); - - GF_FREE (conf); - } - - return -1; -} - +#include "dht-common.h" + +struct xlator_fops dht_pt_fops = { + /* we need to keep mkdir to make sure we + have layout on new directory */ + .mkdir = dht_pt_mkdir, + .getxattr = dht_pt_getxattr, + .fgetxattr = dht_pt_fgetxattr, + + /* required to trace fop properly in changelog */ + .rename = dht_pt_rename, + + /* FIXME: commenting the '.lookup()' below made some of + the failing tests to pass. I would remove the below + line, but keeping it here as a reminder for people + to check for issues if they find concerns with DHT + pass-through logic */ + /* + .lookup = dht_lookup, + .readdir = dht_readdir, + .readdirp = dht_readdirp, + */ + /* Keeping above as commented, mainly to support the + usecase of a gluster volume getting to 1x(anytype), + due to remove-brick (shrinking) exercise. In that case, + we would need above fops to be available, so we can + handle the case of dangling linkto files (if any) */ +}; struct xlator_fops fops = { - .lookup = dht_lookup, - .mknod = dht_mknod, - .create = dht_create, - - .stat = dht_stat, - .fstat = dht_fstat, - .truncate = dht_truncate, - .ftruncate = dht_ftruncate, - .access = dht_access, - .readlink = dht_readlink, - .setxattr = dht_setxattr, - .fsetxattr = dht_fsetxattr, - .getxattr = dht_getxattr, - .removexattr = dht_removexattr, - .open = dht_open, - .readv = dht_readv, - .writev = dht_writev, - .flush = dht_flush, - .fsync = dht_fsync, - .statfs = dht_statfs, - .lk = dht_lk, - .opendir = dht_opendir, - .readdir = dht_readdir, - .readdirp = dht_readdirp, - .fsyncdir = dht_fsyncdir, - .symlink = dht_symlink, - .unlink = dht_unlink, - .link = dht_link, - .mkdir = dht_mkdir, - .rmdir = dht_rmdir, - .rename = dht_rename, - .inodelk = dht_inodelk, - .finodelk = dht_finodelk, - .entrylk = dht_entrylk, - .fentrylk = dht_fentrylk, - .xattrop = dht_xattrop, - .fxattrop = 
dht_fxattrop, - .setattr = dht_setattr, - .fsetattr = dht_fsetattr, + .ipc = dht_ipc, + .lookup = dht_lookup, + .mknod = dht_mknod, + .create = dht_create, + + .open = dht_open, + .statfs = dht_statfs, + .opendir = dht_opendir, + .readdir = dht_readdir, + .readdirp = dht_readdirp, + .fsyncdir = dht_fsyncdir, + .symlink = dht_symlink, + .unlink = dht_unlink, + .link = dht_link, + .mkdir = dht_mkdir, + .rmdir = dht_rmdir, + .rename = dht_rename, + .entrylk = dht_entrylk, + .fentrylk = dht_fentrylk, + + /* Inode read operations */ + .stat = dht_stat, + .fstat = dht_fstat, + .access = dht_access, + .readlink = dht_readlink, + .getxattr = dht_getxattr, + .fgetxattr = dht_fgetxattr, + .readv = dht_readv, + .flush = dht_flush, + .fsync = dht_fsync, + .inodelk = dht_inodelk, + .finodelk = dht_finodelk, + .lk = dht_lk, + .lease = dht_lease, + + /* Inode write operations */ + .fremovexattr = dht_fremovexattr, + .removexattr = dht_removexattr, + .setxattr = dht_setxattr, + .fsetxattr = dht_fsetxattr, + .truncate = dht_truncate, + .ftruncate = dht_ftruncate, + .writev = dht_writev, + .xattrop = dht_xattrop, + .fxattrop = dht_fxattrop, + .setattr = dht_setattr, + .fsetattr = dht_fsetattr, + .fallocate = dht_fallocate, + .discard = dht_discard, + .zerofill = dht_zerofill, }; struct xlator_dumpops dumpops = { - .priv = dht_priv_dump, - .inodectx = dht_inodectx_dump, + .priv = dht_priv_dump, + .inodectx = dht_inodectx_dump, }; - struct xlator_cbks cbks = { -// .release = dht_release, -// .releasedir = dht_releasedir, - .forget = dht_forget + .release = dht_release, + // .releasedir = dht_releasedir, + .forget = dht_forget, }; - -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .value = {"auto", "yes", "no", "enable", "disable", "1", "0", - "on", "off"}, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - }, - { .key = {"unhashed-sticky-bit"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = 
{"use-readdirp"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {NULL} }, +extern int32_t +mem_acct_init(xlator_t *this); + +extern struct volume_options dht_options[]; + +xlator_api_t xlator_api = { + .init = dht_init, + .fini = dht_fini, + .notify = dht_notify, + .reconfigure = dht_reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = dht_options, + .identifier = "distribute", + .pass_through_fops = &dht_pt_fops, + .category = GF_MAINTAINED, }; diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c index 6f14362f49f..3648a564840 100644 --- a/xlators/cluster/dht/src/nufa.c +++ b/xlators/cluster/dht/src/nufa.c @@ -1,746 +1,657 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "dht-common.c" +#include "dht-common.h" /* TODO: all 'TODO's in dht.c holds good */ +extern struct volume_options dht_options[]; + int -nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) +nufa_local_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - xlator_t *subvol = NULL; - char is_linkfile = 0; - char is_dir = 0; - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - loc_t *loc = NULL; - int i = 0; - call_frame_t *prev = NULL; - int call_cnt = 0; - int ret = 0; - - - conf = this->private; - - prev = cookie; - local = frame->local; - loc = &local->loc; - - if (ENTRY_MISSING (op_ret, op_errno)) { - if (conf->search_unhashed) { - local->op_errno = ENOENT; - dht_lookup_everywhere (frame, this, loc); - return 0; - } + xlator_t *subvol = NULL; + char is_linkfile = 0; + char is_dir = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; + int i = 0; + xlator_t *prev = NULL; + int call_cnt = 0; + int ret = 0; + + conf = this->private; + + prev = cookie; + local = frame->local; + loc = &local->loc; + + if (ENTRY_MISSING(op_ret, op_errno)) { + if (conf->search_unhashed) { + local->op_errno = ENOENT; + dht_lookup_everywhere(frame, this, loc); + return 0; + } + } + + if (op_ret == -1) + goto out; + + is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name); + is_dir = check_is_dir(inode, stbuf, xattr); + + if (!is_dir && !is_linkfile) { + /* non-directory and not a linkfile */ + ret = dht_layout_preset(this, prev, inode); + if (ret < 0) { + gf_msg_debug(this->name, 0, + "could not set pre-set layout for subvol" + " %s", + prev->name); + op_ret = -1; + op_errno = EINVAL; + goto err; } - if 
(op_ret == -1) - goto out; + goto out; + } - is_linkfile = check_is_linkfile (inode, stbuf, xattr); - is_dir = check_is_dir (inode, stbuf, xattr); + if (is_dir) { + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; - if (!is_dir && !is_linkfile) { - /* non-directory and not a linkfile */ + local->inode = inode_ref(inode); + local->xattr = dict_ref(xattr); - dht_itransform (this, prev->this, stbuf->ia_ino, - &stbuf->ia_ino); + local->op_ret = 0; + local->op_errno = 0; - ret = dht_layout_preset (this, prev->this, inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not set pre-set layout for subvol %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto err; - } - - goto out; + local->layout = dht_layout_new(this, conf->subvolume_cnt); + if (!local->layout) { + op_ret = -1; + op_errno = ENOMEM; + goto err; } - if (is_dir) { - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; - - local->inode = inode_ref (inode); - local->xattr = dict_ref (xattr); - - local->op_ret = 0; - local->op_errno = 0; - - local->layout = dht_layout_new (this, conf->subvolume_cnt); - if (!local->layout) { - op_ret = -1; - op_errno = ENOMEM; - goto err; - } - - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - &local->loc, local->xattr_req); - } + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, + local->xattr_req); } + } - if (is_linkfile) { - subvol = dht_linkfile_subvol (this, inode, stbuf, xattr); - - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "linkfile not having link subvolume. 
path=%s", - loc->path); - dht_lookup_everywhere (frame, this, loc); - return 0; - } + if (is_linkfile) { + subvol = dht_linkfile_subvol(this, inode, stbuf, xattr); - STACK_WIND (frame, dht_lookup_linkfile_cbk, - subvol, subvol->fops->lookup, - &local->loc, local->xattr_req); + if (!subvol) { + gf_msg_debug(this->name, 0, + "linkfile has no link subvolume. path=%s", loc->path); + dht_lookup_everywhere(frame, this, loc); + return 0; } - return 0; + STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol, + subvol->fops->lookup, &local->loc, local->xattr_req); + } + + return 0; out: - if (!local->hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - local->loc.path); - local->op_errno = ENOENT; - dht_lookup_everywhere (frame, this, loc); - return 0; - } + if (!local->hashed_subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + local->loc.path); + local->op_errno = ENOENT; + dht_lookup_everywhere(frame, this, loc); + return 0; + } - STACK_WIND (frame, dht_lookup_cbk, - local->hashed_subvol, local->hashed_subvol->fops->lookup, - &local->loc, local->xattr_req); + STACK_WIND_COOKIE(frame, dht_lookup_cbk, local->hashed_subvol, + local->hashed_subvol, local->hashed_subvol->fops->lookup, + &local->loc, local->xattr_req); - return 0; + return 0; err: - DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, - inode, stbuf, xattr, NULL); - return 0; + DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr, + postparent); + return 0; } int -nufa_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) +nufa_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; - xlator_t *subvol = NULL; - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - int op_errno = -1; - dht_layout_t *layout = NULL; - int i = 0; - int call_cnt = 0; - - - VALIDATE_OR_GOTO (frame, err); - 
VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - conf = this->private; - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; + xlator_t *hashed_subvol = NULL; + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + + conf = this->private; + + local = dht_local_init(frame, loc, NULL, GF_FOP_LOOKUP); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + if (xattr_req) { + local->xattr_req = dict_ref(xattr_req); + } else { + local->xattr_req = dict_new(); + } + + hashed_subvol = dht_subvol_get_hashed(this, &local->loc); + + local->hashed_subvol = hashed_subvol; + + if (is_revalidate(loc)) { + layout = local->layout; + if (!layout) { + gf_msg_debug(this->name, 0, + "revalidate lookup without cache. " + "path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + if (layout->gen && (layout->gen < conf->gen)) { + gf_msg_debug(this->name, 0, "incomplete layout failure for path=%s", + loc->path); + dht_layout_unref(this, local->layout); + goto do_fresh_lookup; + } + + local->inode = inode_ref(loc->inode); + + local->call_cnt = layout->cnt; + call_cnt = local->call_cnt; + + /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, + * revalidates directly go to the cached-subvolume. 
+ */ + ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dict value."); + op_errno = -1; + goto err; } - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "copying location failed for path=%s", - loc->path); - goto err; - } + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND_COOKIE(frame, dht_revalidate_cbk, subvol, subvol, + subvol->fops->lookup, loc, local->xattr_req); - if (xattr_req) { - local->xattr_req = dict_ref (xattr_req); - } else { - local->xattr_req = dict_new (); + if (!--call_cnt) + break; + } + } else { + do_fresh_lookup: + ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dict value."); + op_errno = -1; + goto err; } - hashed_subvol = dht_subvol_get_hashed (this, &local->loc); - cached_subvol = dht_subvol_get_cached (this, local->loc.inode); - - local->cached_subvol = cached_subvol; - local->hashed_subvol = hashed_subvol; - - if (is_revalidate (loc)) { - local->layout = layout = dht_layout_get (this, loc->inode); - - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "revalidate without cache. path=%s", - loc->path); - op_errno = EINVAL; - goto err; - } - - if (layout->gen && (layout->gen < conf->gen)) { - gf_log (this->name, GF_LOG_DEBUG, - "incomplete layout failure for path=%s", - loc->path); - dht_layout_unref (this, local->layout); - goto do_fresh_lookup; - } - - local->inode = inode_ref (loc->inode); - local->ia_ino = loc->inode->ino; - - local->call_cnt = layout->cnt; - call_cnt = local->call_cnt; - - /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, - * revalidates directly go to the cached-subvolume. 
- */ - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to set dict value."); - op_errno = -1; - goto err; - } - - for (i = 0; i < layout->cnt; i++) { - subvol = layout->list[i].xlator; - - STACK_WIND (frame, dht_revalidate_cbk, - subvol, subvol->fops->lookup, - loc, local->xattr_req); - - if (!--call_cnt) - break; - } - } else { - do_fresh_lookup: - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to set dict value."); - op_errno = -1; - goto err; - } - - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht.linkto", 256); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to set dict value."); - op_errno = -1; - goto err; - } - - /* Send it to only local volume */ - STACK_WIND (frame, nufa_local_lookup_cbk, - (xlator_t *)conf->private, - ((xlator_t *)conf->private)->fops->lookup, - loc, local->xattr_req); + ret = dict_set_uint32(local->xattr_req, conf->link_xattr_name, 256); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dict value."); + op_errno = -1; + goto err; } - return 0; + /* Send it to only local volume */ + STACK_WIND_COOKIE( + frame, nufa_local_lookup_cbk, ((xlator_t *)conf->private), + ((xlator_t *)conf->private), + ((xlator_t *)conf->private)->fops->lookup, loc, local->xattr_req); + } + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); - return 0; + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; } int -nufa_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, - struct iatt *postparent) +nufa_create_linkfile_create_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret == -1) - goto err; + if (op_ret == -1) + goto err; - STACK_WIND (frame, dht_create_cbk, - local->cached_subvol, local->cached_subvol->fops->create, - &local->loc, local->flags, local->mode, local->fd, - local->params); + STACK_WIND_COOKIE(frame, dht_create_cbk, local->cached_subvol, + local->cached_subvol, local->cached_subvol->fops->create, + &local->loc, local->flags, local->mode, local->umask, + local->fd, local->params); - return 0; + return 0; err: - DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - return 0; + DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; } int -nufa_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) +nufa_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *params) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - xlator_t *subvol = NULL; - xlator_t *avail_subvol = NULL; - int op_errno = -1; - int ret = -1; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + xlator_t *avail_subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + conf = this->private; + + 
dht_get_du_info(frame, this, loc); + + local = dht_local_init(frame, loc, fd, GF_FOP_CREATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = dht_subvol_get_hashed(this, loc); + if (!subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + avail_subvol = conf->private; + if (dht_is_subvol_filled(this, (xlator_t *)conf->private)) { + avail_subvol = dht_free_disk_available_subvol( + this, (xlator_t *)conf->private, local); + } + + if (subvol != avail_subvol) { + /* create a link file instead of actual file */ + local->params = dict_ref(params); + local->mode = mode; + local->flags = flags; + local->umask = umask; + local->cached_subvol = avail_subvol; + dht_linkfile_create(frame, nufa_create_linkfile_create_cbk, this, + avail_subvol, subvol, loc); + return 0; + } - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); + gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name); - conf = this->private; + STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol, + subvol->fops->create, loc, flags, mode, umask, fd, + params); - dht_get_du_info (frame, this, loc); + return 0; - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); - avail_subvol = conf->private; - if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { - avail_subvol = - dht_free_disk_available_subvol (this, - (xlator_t *)conf->private); - } + return 0; +} - if (subvol != avail_subvol) { - /* create a link file instead of actual file */ - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = ENOMEM; - goto err; - } - - local->fd = fd_ref (fd); - local->params = dict_ref (params); - local->mode = mode; - local->flags = flags; - - local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, - nufa_create_linkfile_create_cbk, - avail_subvol, subvol, loc); - return 0; - } +int +nufa_mknod_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); + local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + op_ret = -1; + goto err; + } - STACK_WIND (frame, dht_create_cbk, - subvol, subvol->fops->create, - loc, flags, mode, fd, params); + if (op_ret >= 0) { + STACK_WIND_COOKIE( + frame, dht_newfile_cbk, (void *)local->cached_subvol, + local->cached_subvol, local->cached_subvol->fops->mknod, + &local->loc, local->mode, local->rdev, local->umask, local->params); return 0; - + } err: - op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + WIPE(postparent); + WIPE(preparent); - return 0; + DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent, + postparent, xdata); + return 0; } int -nufa_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, - struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) +nufa_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *params) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + xlator_t *avail_subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + conf = this->private; + + dht_get_du_info(frame, this, loc); + + local = dht_local_init(frame, loc, NULL, GF_FOP_MKNOD); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = dht_subvol_get_hashed(this, loc); + if (!subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + /* Consider the disksize in consideration */ + avail_subvol = conf->private; + if (dht_is_subvol_filled(this, (xlator_t *)conf->private)) { + avail_subvol = dht_free_disk_available_subvol( + this, (xlator_t *)conf->private, local); + } + + if (avail_subvol != subvol) { + /* Create linkfile first */ + + local->params = dict_ref(params); + local->mode = mode; + local->umask = umask; + local->rdev = rdev; + local->cached_subvol = avail_subvol; + + dht_linkfile_create(frame, nufa_mknod_linkfile_cbk, this, avail_subvol, + subvol, loc); + return 0; + } - local = frame->local; + gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name); - if (op_ret >= 0) { - STACK_WIND (frame, dht_newfile_cbk, - local->cached_subvol, - local->cached_subvol->fops->mknod, - &local->loc, 
local->mode, local->rdev, - local->params); + STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, params); - return 0; - } + return 0; - WIPE (postparent); - WIPE (preparent); +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - DHT_STACK_UNWIND (link, frame, op_ret, op_errno, - inode, stbuf, preparent, postparent); - return 0; + return 0; } - -int -nufa_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *params) +gf_boolean_t +same_first_part(char *str1, char term1, char *str2, char term2) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - xlator_t *subvol = NULL; - xlator_t *avail_subvol = NULL; - int op_errno = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); + gf_boolean_t ended1; + gf_boolean_t ended2; - conf = this->private; - - dht_get_du_info (frame, this, loc); - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } - - /* Consider the disksize in consideration */ - avail_subvol = conf->private; - if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { - avail_subvol = - dht_free_disk_available_subvol (this, - (xlator_t *)conf->private); + for (;;) { + ended1 = ((*str1 == '\0') || (*str1 == term1)); + ended2 = ((*str2 == '\0') || (*str2 == term2)); + if (ended1 && ended2) { + return _gf_true; } - - if (avail_subvol != subvol) { - /* Create linkfile first */ - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = ENOMEM; - goto err; - } - - local->params = dict_ref (params); - local->mode = mode; - local->rdev = rdev; - local->cached_subvol = avail_subvol; - - 
dht_linkfile_create (frame, nufa_mknod_linkfile_cbk, - avail_subvol, subvol, loc); - return 0; + if (ended1 || ended2 || (*str1 != *str2)) { + return _gf_false; } + ++str1; + ++str2; + } +} - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); +typedef struct nufa_args { + xlator_t *this; + char *volname; + gf_boolean_t addr_match; +} nufa_args_t; - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, params); +static void +nufa_find_local_brick(xlator_t *xl, void *data) +{ + nufa_args_t *args = data; + xlator_t *this = args->this; + char *local_volname = args->volname; + gf_boolean_t addr_match = args->addr_match; + char *brick_host = NULL; + dht_conf_t *conf = this->private; + int ret = -1; + + /*This means a local subvol was already found. We pick the first brick + * that is local*/ + if (conf->private) + return; - return 0; + if (strcmp(xl->name, local_volname) == 0) { + conf->private = xl; + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "Using specified subvol %s", local_volname); + return; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + if (!addr_match) + return; - return 0; + ret = dict_get_str(xl->options, "remote-host", &brick_host); + if ((ret == 0) && (gf_is_same_address(local_volname, brick_host) || + gf_is_local_addr(brick_host))) { + conf->private = xl; + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "Using the first local " + "subvol %s", + xl->name); + return; + } } - -int -notify (xlator_t *this, int event, void *data, ...) 
+static void +nufa_to_dht(xlator_t *this) { - int ret = -1; + GF_ASSERT(this); + GF_ASSERT(this->fops); - ret = dht_notify (this, event, data); - - return ret; + this->fops->lookup = dht_lookup; + this->fops->create = dht_create; + this->fops->mknod = dht_mknod; } -void -fini (xlator_t *this) +int +nufa_find_local_subvol(xlator_t *this, void (*fn)(xlator_t *each, void *data), + void *data) { - int i = 0; - dht_conf_t *conf = NULL; - - conf = this->private; - - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->default_dir_layout) - GF_FREE (conf->default_dir_layout); - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - GF_FREE (conf); - } - - return; + int ret = -1; + dht_conf_t *conf = this->private; + xlator_list_t *trav = NULL; + xlator_t *parent = NULL; + xlator_t *candidate = NULL; + + xlator_foreach_depth_first(this, fn, data); + if (!conf->private) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_BRICK_ERROR, + "Couldn't find a local " + "brick"); + return -1; + } + + candidate = conf->private; + trav = candidate->parents; + while (trav) { + parent = trav->xlator; + if (strcmp(parent->type, "cluster/nufa") == 0) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "Found local subvol, " + "%s", + candidate->name); + ret = 0; + conf->private = candidate; + break; + } + + candidate = parent; + trav = parent->parents; + } + + return ret; } int -init (xlator_t *this) +nufa_init(xlator_t *this) { - dht_conf_t *conf = NULL; - xlator_list_t *trav = NULL; - data_t *data = NULL; - char *local_volname = NULL; - char *temp_str = NULL; - int ret = -1; - int i = 0; - char my_hostname[256]; - uint32_t temp_free_disk = 0; - - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "NUFA needs more than one subvolume"); - return -1; - } - - if (!this->parents) { 
- gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile"); - } - - conf = GF_CALLOC (1, sizeof (*conf), - gf_dht_mt_dht_conf_t); - if (!conf) { - goto err; - } - - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean */ - if (strcasecmp (temp_str, "auto")) - gf_string2boolean (temp_str, &conf->search_unhashed); - else - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; - } - - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; - } - - LOCK_INIT (&conf->subvolume_lock); - LOCK_INIT (&conf->layout_lock); + data_t *data = NULL; + char *local_volname = NULL; + int ret = -1; + char my_hostname[256]; + gf_boolean_t addr_match = _gf_false; + nufa_args_t args = { + 0, + }; + + ret = dht_init(this); + if (ret) { + return ret; + } - conf->gen = 1; + if ((data = dict_get(this->options, "local-volume-name"))) { + local_volname = data->data; + } else { + addr_match = _gf_true; local_volname = "localhost"; - ret = gethostname (my_hostname, 256); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "could not find hostname (%s)", - strerror (errno)); - } - + ret = gethostname(my_hostname, 256); if (ret == 0) - local_volname = my_hostname; - - data = dict_get (this->options, "local-volume-name"); - if (data) { - local_volname = data->data; - } - - trav = this->children; - while (trav) { - if (strcmp (trav->xlator->name, local_volname) == 0) - break; - trav = trav->next; - } - - if (!trav) { - gf_log (this->name, GF_LOG_ERROR, - "Could not find subvolume named '%s'. 
" - "Please define volume with the name as the hostname " - "or override it with 'option local-volume-name'", - local_volname); - goto err; - } - /* The volume specified exists */ - conf->private = trav->xlator; - - conf->min_free_disk = 10; - conf->disk_unit = 'p'; - - if (dict_get_str (this->options, "min-free-disk", - &temp_str) == 0) { - if (gf_string2percent (temp_str, - &temp_free_disk) == 0) { - if (temp_free_disk > 100) { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } else { - conf->min_free_disk = (uint64_t)temp_free_disk; - conf->disk_unit = 'p'; - } - } else { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } - } - - conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), - gf_dht_mt_dht_du_t); - if (!conf->du_stats) { - goto err; - } - - this->private = conf; - - return 0; - -err: - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->default_dir_layout) - GF_FREE (conf->default_dir_layout); - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - if (conf->du_stats) - GF_FREE (conf->du_stats); - - GF_FREE (conf); - } - - return -1; + local_volname = my_hostname; + + else + gf_msg(this->name, GF_LOG_WARNING, errno, + DHT_MSG_GET_HOSTNAME_FAILED, "could not find hostname"); + } + + args.this = this; + args.volname = local_volname; + args.addr_match = addr_match; + ret = nufa_find_local_subvol(this, nufa_find_local_brick, &args); + if (ret) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "Unable to find local subvolume, switching " + "to dht mode"); + nufa_to_dht(this); + } + return 0; } - -struct xlator_fops fops = { - .lookup = nufa_lookup, - .create = nufa_create, - .mknod = nufa_mknod, - - .stat = dht_stat, - .fstat = dht_fstat, - .truncate = dht_truncate, - 
.ftruncate = dht_ftruncate, - .access = dht_access, - .readlink = dht_readlink, - .setxattr = dht_setxattr, - .getxattr = dht_getxattr, - .removexattr = dht_removexattr, - .open = dht_open, - .readv = dht_readv, - .writev = dht_writev, - .flush = dht_flush, - .fsync = dht_fsync, - .statfs = dht_statfs, - .lk = dht_lk, - .opendir = dht_opendir, - .readdir = dht_readdir, - .readdirp = dht_readdirp, - .fsyncdir = dht_fsyncdir, - .symlink = dht_symlink, - .unlink = dht_unlink, - .link = dht_link, - .mkdir = dht_mkdir, - .rmdir = dht_rmdir, - .rename = dht_rename, - .inodelk = dht_inodelk, - .finodelk = dht_finodelk, - .entrylk = dht_entrylk, - .fentrylk = dht_fentrylk, - .xattrop = dht_xattrop, - .fxattrop = dht_fxattrop, - .setattr = dht_setattr, +dht_methods_t dht_methods = { + .migration_get_dst_subvol = dht_migration_get_dst_subvol, + .layout_search = dht_layout_search, }; - -struct xlator_cbks cbks = { - .forget = dht_forget +struct xlator_fops fops = { + .lookup = nufa_lookup, + .create = nufa_create, + .mknod = nufa_mknod, + + .stat = dht_stat, + .fstat = dht_fstat, + .truncate = dht_truncate, + .ftruncate = dht_ftruncate, + .access = dht_access, + .readlink = dht_readlink, + .setxattr = dht_setxattr, + .getxattr = dht_getxattr, + .removexattr = dht_removexattr, + .open = dht_open, + .readv = dht_readv, + .writev = dht_writev, + .flush = dht_flush, + .fsync = dht_fsync, + .statfs = dht_statfs, + .lk = dht_lk, + .opendir = dht_opendir, + .readdir = dht_readdir, + .readdirp = dht_readdirp, + .fsyncdir = dht_fsyncdir, + .symlink = dht_symlink, + .unlink = dht_unlink, + .link = dht_link, + .mkdir = dht_mkdir, + .rmdir = dht_rmdir, + .rename = dht_rename, + .inodelk = dht_inodelk, + .finodelk = dht_finodelk, + .entrylk = dht_entrylk, + .fentrylk = dht_fentrylk, + .xattrop = dht_xattrop, + .fxattrop = dht_fxattrop, + .setattr = dht_setattr, }; - -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .value = {"auto", "yes", "no", "enable", "disable", 
"1", "0", - "on", "off"}, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"local-volume-name"}, - .type = GF_OPTION_TYPE_XLATOR - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - }, - { .key = {NULL} }, +struct xlator_cbks cbks = {.forget = dht_forget}; +extern int32_t +mem_acct_init(xlator_t *this); + +xlator_api_t xlator_api = { + .init = nufa_init, + .fini = dht_fini, + .notify = dht_notify, + .reconfigure = dht_reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = dht_options, + .identifier = "nufa", + .category = GF_TECH_PREVIEW, }; diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c index 20356e24eb4..207d109a025 100644 --- a/xlators/cluster/dht/src/switch.c +++ b/xlators/cluster/dht/src/switch.c @@ -1,29 +1,14 @@ /* - Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "dht-common.c" +#include "dht-common.h" #include "dht-mem-types.h" #include <sys/time.h> @@ -31,1020 +16,876 @@ #include <fnmatch.h> #include <string.h> +extern struct volume_options dht_options[]; + struct switch_sched_array { - xlator_t *xl; - int32_t eligible; - int32_t considered; + xlator_t *xl; + int32_t eligible; + int32_t considered; }; /* Select one of this struct based on the path's pattern match */ struct switch_struct { - struct switch_struct *next; - struct switch_sched_array *array; - int32_t node_index; /* Index of the node in - this pattern. */ - int32_t num_child; /* Total num of child nodes - with this pattern. */ - char path_pattern[256]; + struct switch_struct *next; + struct switch_sched_array *array; + int32_t node_index; /* Index of the node in + this pattern. */ + int32_t num_child; /* Total num of child nodes + with this pattern. */ + char path_pattern[256]; }; /* TODO: all 'TODO's in dht.c holds good */ /* This function should return child node as '*:subvolumes' is inserterd */ static int32_t -gf_switch_valid_child (xlator_t *this, const char *child) +gf_switch_valid_child(xlator_t *this, const char *child) { - xlator_list_t *children = NULL; - int32_t ret = 0; + xlator_list_t *children = NULL; + int32_t ret = 0; - children = this->children; - while (children) { - if (!strcmp (child, children->xlator->name)) { - ret = 1; - break; - } - children = children->next; + children = this->children; + while (children) { + if (!strcmp(child, children->xlator->name)) { + ret = 1; + break; } + children = children->next; + } - return ret; + return ret; } static xlator_t * -get_switch_matching_subvol (const char *path, dht_conf_t *conf, - xlator_t *hashed_subvol) +get_switch_matching_subvol(const char *path, dht_conf_t *conf, + xlator_t *hashed_subvol) { - struct switch_struct *cond = NULL; - struct switch_struct *trav = NULL; - char *pathname = NULL; - int idx = 0; - - 
cond = conf->private; - if (!cond) - return hashed_subvol; + struct switch_struct *cond = NULL; + struct switch_struct *trav = NULL; + char *pathname = NULL; + int idx = 0; + xlator_t *subvol = NULL; + + cond = conf->private; + subvol = hashed_subvol; + if (!cond) + goto out; + + pathname = gf_strdup(path); + if (!pathname) + goto out; + + trav = cond; + while (trav) { + if (fnmatch(trav->path_pattern, pathname, FNM_NOESCAPE) == 0) { + for (idx = 0; idx < trav->num_child; idx++) { + if (trav->array[idx].xl == hashed_subvol) + goto out; + } + idx = trav->node_index++; + trav->node_index %= trav->num_child; + subvol = trav->array[idx].xl; + goto out; + } + trav = trav->next; + } +out: + GF_FREE(pathname); - trav = cond; - pathname = gf_strdup (path); - while (trav) { - if (fnmatch (trav->path_pattern, - pathname, FNM_NOESCAPE) == 0) { - for (idx = 0; idx < trav->num_child; idx++) { - if (trav->array[idx].xl == hashed_subvol) - return hashed_subvol; - } - idx = trav->node_index++; - trav->node_index %= trav->num_child; - return trav->array[idx].xl; - } - trav = trav->next; - } - GF_FREE (pathname); - return hashed_subvol; + return subvol; } - int -switch_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) +switch_local_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - xlator_t *subvol = NULL; - char is_linkfile = 0; - char is_dir = 0; - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - loc_t *loc = NULL; - int i = 0; - call_frame_t *prev = NULL; - int call_cnt = 0; - int ret = 0; - - conf = this->private; - - prev = cookie; - local = frame->local; - loc = &local->loc; - - if (ENTRY_MISSING (op_ret, op_errno)) { - if (conf->search_unhashed) { - local->op_errno = ENOENT; - dht_lookup_everywhere (frame, this, loc); - 
return 0; - } - } + xlator_t *subvol = NULL; + char is_linkfile = 0; + char is_dir = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; + int i = 0; + xlator_t *prev = NULL; + int call_cnt = 0; + int ret = 0; - if (op_ret == -1) - goto out; + conf = this->private; - is_linkfile = check_is_linkfile (inode, stbuf, xattr); - is_dir = check_is_dir (inode, stbuf, xattr); + prev = cookie; + local = frame->local; + loc = &local->loc; - if (!is_dir && !is_linkfile) { - /* non-directory and not a linkfile */ + if (ENTRY_MISSING(op_ret, op_errno)) { + if (conf->search_unhashed) { + local->op_errno = ENOENT; + dht_lookup_everywhere(frame, this, loc); + return 0; + } + } - dht_itransform (this, prev->this, stbuf->ia_ino, - &stbuf->ia_ino); + if (op_ret == -1) + goto out; - ret = dht_layout_preset (this, prev->this, inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not set pre-set layout for subvol %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto err; - } + is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name); + is_dir = check_is_dir(inode, stbuf, xattr); - goto out; + if (!is_dir && !is_linkfile) { + /* non-directory and not a linkfile */ + + ret = dht_layout_preset(this, prev, inode); + if (ret < 0) { + gf_msg_debug(this->name, 0, + "could not set pre-set layout " + "for subvol %s", + prev->name); + op_ret = -1; + op_errno = EINVAL; + goto err; } - if (is_dir) { - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; + goto out; + } - local->inode = inode_ref (inode); - local->xattr = dict_ref (xattr); + if (is_dir) { + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; - local->op_ret = 0; - local->op_errno = 0; + local->inode = inode_ref(inode); + local->xattr = dict_ref(xattr); - local->layout = dht_layout_new (this, conf->subvolume_cnt); - if (!local->layout) { - op_ret = -1; - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_DEBUG, - "memory allocation failed 
:("); - goto err; - } + local->op_ret = 0; + local->op_errno = 0; - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - &local->loc, local->xattr_req); - } + local->layout = dht_layout_new(this, conf->subvolume_cnt); + if (!local->layout) { + op_ret = -1; + op_errno = ENOMEM; + gf_msg_debug(this->name, 0, "memory allocation failed :("); + goto err; } - if (is_linkfile) { - subvol = dht_linkfile_subvol (this, inode, stbuf, xattr); + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, + local->xattr_req); + } + } - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "linkfile not having link subvolume. path=%s", - loc->path); - dht_lookup_everywhere (frame, this, loc); - return 0; - } + if (is_linkfile) { + subvol = dht_linkfile_subvol(this, inode, stbuf, xattr); - STACK_WIND (frame, dht_lookup_linkfile_cbk, - subvol, subvol->fops->lookup, - &local->loc, local->xattr_req); + if (!subvol) { + gf_msg_debug(this->name, 0, + "linkfile has no link subvolume.path=%s", loc->path); + dht_lookup_everywhere(frame, this, loc); + return 0; } - return 0; + STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol, + subvol->fops->lookup, &local->loc, local->xattr_req); + } + + return 0; out: - if (!local->hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - local->loc.path); - local->op_errno = ENOENT; - dht_lookup_everywhere (frame, this, loc); - return 0; - } + if (!local->hashed_subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + local->loc.path); + local->op_errno = ENOENT; + dht_lookup_everywhere(frame, this, loc); + return 0; + } - STACK_WIND (frame, dht_lookup_cbk, - local->hashed_subvol, local->hashed_subvol->fops->lookup, - &local->loc, local->xattr_req); + STACK_WIND_COOKIE(frame, 
dht_lookup_cbk, local->hashed_subvol, + local->hashed_subvol, local->hashed_subvol->fops->lookup, + &local->loc, local->xattr_req); - return 0; + return 0; err: - DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, - inode, stbuf, xattr, NULL); - return 0; + DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr, + NULL); + return 0; } int -switch_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) +switch_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xattr_req) { - xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; - xlator_t *subvol = NULL; - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - int op_errno = -1; - dht_layout_t *layout = NULL; - int i = 0; - int call_cnt = 0; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - conf = this->private; - - local = dht_local_init (frame); - if (!local) { + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + + conf = this->private; + + local = dht_local_init(frame, loc, NULL, GF_FOP_LOOKUP); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + if (xattr_req) { + local->xattr_req = dict_ref(xattr_req); + } else { + local->xattr_req = dict_new(); + } + + hashed_subvol = dht_subvol_get_hashed(this, &local->loc); + cached_subvol = local->cached_subvol; + + local->hashed_subvol = hashed_subvol; + + if (is_revalidate(loc)) { + layout = local->layout; + if (!layout) { + gf_msg_debug(this->name, 0, + "revalidate lookup without cache. 
path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (layout->gen && (layout->gen < conf->gen)) { + gf_msg_debug(this->name, 0, "incomplete layout failure for path=%s", + loc->path); + dht_layout_unref(this, local->layout); + goto do_fresh_lookup; + } + + local->inode = inode_ref(loc->inode); + + local->call_cnt = layout->cnt; + call_cnt = local->call_cnt; + + /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' + * attribute, revalidates directly go to the cached-subvolume. + */ + ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4); + if (ret < 0) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "failed to set dict value for %s", conf->xattr_name); + + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND_COOKIE(frame, dht_revalidate_cbk, subvol, subvol, + subvol->fops->lookup, loc, local->xattr_req); + + if (!--call_cnt) + break; + } + } else { + do_fresh_lookup: + ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4); + if (ret < 0) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "failed to set dict value for %s", conf->xattr_name); + + ret = dict_set_uint32(local->xattr_req, conf->link_xattr_name, 256); + if (ret < 0) + gf_msg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_DICT_SET_FAILED, + "failed to set dict value for %s", conf->link_xattr_name); + + if (!hashed_subvol) { + gf_msg_debug(this->name, 0, + "no subvolume in layout for path=%s, " + "checking on all the subvols to see if " + "it is a directory", + loc->path); + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new(this, conf->subvolume_cnt); + if (!local->layout) { op_errno = ENOMEM; goto err; - } - - ret = loc_dup (loc, &local->loc); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "copying location failed for path=%s", - loc->path); - goto err; - } - - if (xattr_req) { - local->xattr_req = dict_ref (xattr_req); - } else 
{ - local->xattr_req = dict_new (); - } - - hashed_subvol = dht_subvol_get_hashed (this, &local->loc); - cached_subvol = dht_subvol_get_cached (this, local->loc.inode); - - local->cached_subvol = cached_subvol; - local->hashed_subvol = hashed_subvol; - - if (is_revalidate (loc)) { - local->layout = layout = dht_layout_get (this, loc->inode); - - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "revalidate without cache. path=%s", - loc->path); - op_errno = EINVAL; - goto err; - } - - if (layout->gen && (layout->gen < conf->gen)) { - gf_log (this->name, GF_LOG_DEBUG, - "incomplete layout failure for path=%s", - loc->path); - dht_layout_unref (this, local->layout); - goto do_fresh_lookup; - } - - local->inode = inode_ref (loc->inode); - local->ia_ino = loc->inode->ino; - - local->call_cnt = layout->cnt; - call_cnt = local->call_cnt; - - /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' - * attribute, revalidates directly go to the cached-subvolume. - */ - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for " - "trusted.glusterfs.dht"); - - for (i = 0; i < layout->cnt; i++) { - subvol = layout->list[i].xlator; - - STACK_WIND (frame, dht_revalidate_cbk, - subvol, subvol->fops->lookup, - loc, local->xattr_req); - - if (!--call_cnt) - break; - } + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, + conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); + } + return 0; + } + + /* */ + cached_subvol = get_switch_matching_subvol(loc->path, conf, + hashed_subvol); + if (cached_subvol == hashed_subvol) { + STACK_WIND_COOKIE(frame, dht_lookup_cbk, hashed_subvol, + hashed_subvol, hashed_subvol->fops->lookup, loc, + local->xattr_req); } else { - do_fresh_lookup: - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); - if (ret < 0) - gf_log 
(this->name, GF_LOG_WARNING, - "failed to set dict value for " - "trusted.glusterfs.dht"); - - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht.linkto", 256); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for " - "trusted.glusterfs.dht.linkto"); - - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s, " - "checking on all the subvols to see if " - "it is a directory", loc->path); - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; - - local->layout = dht_layout_new (this, - conf->subvolume_cnt); - if (!local->layout) { - op_errno = ENOMEM; - goto err; - } - - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - &local->loc, local->xattr_req); - } - return 0; - } - - /* */ - cached_subvol = get_switch_matching_subvol (loc->path, conf, - hashed_subvol); - if (cached_subvol == hashed_subvol) { - STACK_WIND (frame, dht_lookup_cbk, - hashed_subvol, - hashed_subvol->fops->lookup, - loc, local->xattr_req); - } else { - STACK_WIND (frame, switch_local_lookup_cbk, - cached_subvol, - cached_subvol->fops->lookup, - loc, local->xattr_req); - } + STACK_WIND_COOKIE(frame, switch_local_lookup_cbk, cached_subvol, + cached_subvol, cached_subvol->fops->lookup, loc, + local->xattr_req); } + } - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); - return 0; + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; } int -switch_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, - struct iatt *postparent) +switch_create_linkfile_create_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret == -1) - goto err; + if (op_ret == -1) + goto err; - STACK_WIND (frame, dht_create_cbk, - local->cached_subvol, local->cached_subvol->fops->create, - &local->loc, local->flags, local->mode, local->fd, - local->params); + STACK_WIND_COOKIE(frame, dht_create_cbk, local->cached_subvol, + local->cached_subvol, local->cached_subvol->fops->create, + &local->loc, local->flags, local->mode, local->umask, + local->fd, local->params); - return 0; + return 0; err: - DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - return 0; + DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; } int -switch_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) +switch_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *params) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - xlator_t *subvol = NULL; - xlator_t *avail_subvol = NULL; - int op_errno = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - - conf = this->private; - - dht_get_du_info (frame, this, loc); - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - subvol = 
dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } - - avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); - if (dht_is_subvol_filled (this, avail_subvol)) { - avail_subvol = - dht_free_disk_available_subvol (this, avail_subvol); - } - - if (subvol != avail_subvol) { - /* create a link file instead of actual file */ - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = ENOMEM; - goto err; - } - - local->fd = fd_ref (fd); - local->mode = mode; - local->flags = flags; - - local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, - switch_create_linkfile_create_cbk, - avail_subvol, subvol, loc); - return 0; - } + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + xlator_t *avail_subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + conf = this->private; + + dht_get_du_info(frame, this, loc); + + local = dht_local_init(frame, loc, fd, GF_FOP_CREATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = dht_subvol_get_hashed(this, loc); + if (!subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + avail_subvol = get_switch_matching_subvol(loc->path, conf, subvol); + if (dht_is_subvol_filled(this, avail_subvol)) { + avail_subvol = dht_free_disk_available_subvol(this, avail_subvol, + local); + } + + if (subvol != avail_subvol) { + /* create a link file instead of actual file */ + local->mode = mode; + local->flags = flags; + local->umask = umask; + local->cached_subvol = avail_subvol; + dht_linkfile_create(frame, switch_create_linkfile_create_cbk, this, + avail_subvol, subvol, loc); + return 0; + } - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); + 
gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_create_cbk, - subvol, subvol->fops->create, - loc, flags, mode, fd, params); + STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol, + subvol->fops->create, loc, flags, mode, umask, fd, + params); - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); - return 0; + return 0; } int -switch_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, - struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) +switch_mknod_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; - local = frame->local; + local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + op_ret = -1; + goto err; + } - if (op_ret >= 0) { - STACK_WIND (frame, dht_newfile_cbk, - local->cached_subvol, - local->cached_subvol->fops->mknod, - &local->loc, local->mode, local->rdev, - local->params); + if (op_ret >= 0) { + STACK_WIND_COOKIE( + frame, dht_newfile_cbk, (void *)local->cached_subvol, + local->cached_subvol, local->cached_subvol->fops->mknod, + &local->loc, local->mode, local->rdev, local->umask, local->params); - return 0; - } - - DHT_STACK_UNWIND (link, frame, op_ret, op_errno, - inode, stbuf, preparent, postparent); return 0; + } +err: + DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent, + postparent, xdata); + return 0; } - int -switch_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *params) 
+switch_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *params) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - xlator_t *subvol = NULL; - xlator_t *avail_subvol = NULL; - int op_errno = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - - conf = this->private; - - dht_get_du_info (frame, this, loc); - - local = dht_local_init (frame); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } - - /* Consider the disksize in consideration */ - avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); - if (dht_is_subvol_filled (this, avail_subvol)) { - avail_subvol = - dht_free_disk_available_subvol (this, avail_subvol); - } - - if (avail_subvol != subvol) { - /* Create linkfile first */ - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = ENOMEM; - goto err; - } - - local->params = dict_ref (params); - local->mode = mode; - local->rdev = rdev; - local->cached_subvol = avail_subvol; - - dht_linkfile_create (frame, switch_mknod_linkfile_cbk, - avail_subvol, subvol, loc); - return 0; - } - - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); - - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, params); - + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + xlator_t *avail_subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + conf = this->private; + + dht_get_du_info(frame, this, loc); + + local = dht_local_init(frame, loc, NULL, GF_FOP_MKNOD); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = dht_subvol_get_hashed(this, loc); + 
if (!subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + /* Consider the disksize in consideration */ + avail_subvol = get_switch_matching_subvol(loc->path, conf, subvol); + if (dht_is_subvol_filled(this, avail_subvol)) { + avail_subvol = dht_free_disk_available_subvol(this, avail_subvol, + local); + } + + if (avail_subvol != subvol) { + /* Create linkfile first */ + + local->params = dict_ref(params); + local->mode = mode; + local->umask = umask; + local->rdev = rdev; + local->cached_subvol = avail_subvol; + + dht_linkfile_create(frame, switch_mknod_linkfile_cbk, this, + avail_subvol, subvol, loc); return 0; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name); - return 0; -} + STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, params); + return 0; -int -notify (xlator_t *this, int event, void *data, ...) -{ - int ret = -1; - - ret = dht_notify (this, event, data); +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - return ret; + return 0; } void -fini (xlator_t *this) +switch_fini(xlator_t *this) { - int i = 0; - dht_conf_t *conf = NULL; - struct switch_struct *trav = NULL; - struct switch_struct *prev = NULL; - - conf = this->private; - - if (conf) { - trav = (struct switch_struct *)conf->private; - conf->private = NULL; - while (trav) { - if (trav->array) - GF_FREE (trav->array); - prev = trav; - trav = trav->next; - GF_FREE (prev); - } + dht_conf_t *conf = NULL; + struct switch_struct *trav = NULL; + struct switch_struct *prev = NULL; - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->default_dir_layout) - GF_FREE (conf->default_dir_layout); - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); + conf = this->private; - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - GF_FREE (conf); + if (conf) { + trav = (struct switch_struct *)conf->private; + conf->private = NULL; + while (trav) { + GF_FREE(trav->array); + prev = trav; + trav = trav->next; + GF_FREE(prev); } + } - return; + dht_fini(this); } int -set_switch_pattern (xlator_t *this, dht_conf_t *conf, - const char *pattern_str) +set_switch_pattern(xlator_t *this, dht_conf_t *conf, const char *pattern_str) { - int flag = 0; - int idx = 0; - int index = 0; - int child_count = 0; - char *tmp = NULL; - char *tmp1 = NULL; - char *child = NULL; - char *tmp_str = NULL; - char *tmp_str1 = NULL; - char *dup_str = NULL; - char *dup_childs = NULL; - char *switch_str = NULL; - char *pattern = NULL; - char *childs = NULL; - char *option_string = NULL; - struct switch_struct *switch_buf = NULL; - struct switch_struct *switch_opt = NULL; - struct switch_struct *trav = NULL; - struct switch_sched_array *switch_buf_array = NULL; - xlator_list_t *trav_xl = NULL; - - trav_xl = this->children; - while (trav_xl) { 
- index++; - trav_xl = trav_xl->next; - } - child_count = index; - switch_buf_array = GF_CALLOC ((index + 1), - sizeof (struct switch_sched_array), - gf_switch_mt_switch_sched_array); - if (!switch_buf_array) + int flag = 0; + int idx = 0; + int index = 0; + int child_count = 0; + char *tmp = NULL; + char *tmp1 = NULL; + char *child = NULL; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *dup_childs = NULL; + char *switch_str = NULL; + char *pattern = NULL; + char *childs = NULL; + char *option_string = NULL; + size_t pattern_length; + struct switch_struct *switch_buf = NULL; + struct switch_struct *switch_opt = NULL; + struct switch_struct *trav = NULL; + struct switch_sched_array *switch_buf_array = NULL; + xlator_list_t *trav_xl = NULL; + + trav_xl = this->children; + while (trav_xl) { + index++; + trav_xl = trav_xl->next; + } + child_count = index; + switch_buf_array = GF_CALLOC((index + 1), sizeof(struct switch_sched_array), + gf_switch_mt_switch_sched_array); + if (!switch_buf_array) + goto err; + + trav_xl = this->children; + index = 0; + + while (trav_xl) { + switch_buf_array[index].xl = trav_xl->xlator; + switch_buf_array[index].eligible = 1; + trav_xl = trav_xl->next; + index++; + } + + /* *jpg:child1,child2;*mpg:child3;*:child4,child5,child6 */ + + /* Get the pattern for considering switch case. 
+ "option block-size *avi:10MB" etc */ + option_string = gf_strdup(pattern_str); + if (option_string == NULL) { + goto err; + } + switch_str = strtok_r(option_string, ";", &tmp_str); + while (switch_str) { + dup_str = gf_strdup(switch_str); + if (dup_str == NULL) { + goto err; + } + switch_opt = GF_CALLOC(1, sizeof(struct switch_struct), + gf_switch_mt_switch_struct); + if (!switch_opt) { + GF_FREE(dup_str); + goto err; + } + + pattern = strtok_r(dup_str, ":", &tmp_str1); + childs = strtok_r(NULL, ":", &tmp_str1); + if (strncmp(pattern, "*", 2) == 0) { + gf_msg("switch", GF_LOG_INFO, 0, DHT_MSG_SWITCH_PATTERN_INFO, + "'*' pattern will be taken by default " + "for all the unconfigured child nodes," + " hence neglecting current option"); + switch_str = strtok_r(NULL, ";", &tmp_str); + GF_FREE(switch_opt); + switch_opt = NULL; + GF_FREE(dup_str); + continue; + } + GF_FREE(dup_str); + + pattern_length = strlen(pattern); + if (pattern_length >= (sizeof(switch_opt->path_pattern))) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_SET_SWITCH_PATTERN_ERROR, "Pattern (%s) too long", + pattern); + goto err; + } + memcpy(switch_opt->path_pattern, pattern, pattern_length); + switch_opt->path_pattern[pattern_length] = '\0'; + + if (childs) { + dup_childs = gf_strdup(childs); + if (dup_childs == NULL) { goto err; - - trav_xl = this->children; - index = 0; - - while (trav_xl) { - switch_buf_array[index].xl = trav_xl->xlator; - switch_buf_array[index].eligible = 1; - trav_xl = trav_xl->next; - index++; - } - - /* *jpg:child1,child2;*mpg:child3;*:child4,child5,child6 */ - - /* Get the pattern for considering switch case. 
- "option block-size *avi:10MB" etc */ - option_string = gf_strdup (pattern_str); - switch_str = strtok_r (option_string, ";", &tmp_str); - while (switch_str) { - dup_str = gf_strdup (switch_str); - switch_opt = GF_CALLOC (1, sizeof (struct switch_struct), - gf_switch_mt_switch_struct); - if (!switch_opt) - goto err; - - pattern = strtok_r (dup_str, ":", &tmp_str1); - childs = strtok_r (NULL, ":", &tmp_str1); - if (strncmp (pattern, "*", 2) == 0) { - gf_log ("switch", GF_LOG_NORMAL, - "'*' pattern will be taken by default " - "for all the unconfigured child nodes," - " hence neglecting current option"); - switch_str = strtok_r (NULL, ";", &tmp_str); - GF_FREE (dup_str); - continue; - } - memcpy (switch_opt->path_pattern, pattern, strlen (pattern)); - if (childs) { - dup_childs = gf_strdup (childs); - child = strtok_r (dup_childs, ",", &tmp); - while (child) { - if (gf_switch_valid_child (this, child)) { - idx++; - child = strtok_r (NULL, ",", &tmp); - } else { - gf_log (this->name, GF_LOG_ERROR, - "%s is not a subvolume of %s. 
" - "pattern can only be scheduled " - "only to a subvolume of %s", - child, this->name, this->name); - goto err; - } - } - GF_FREE (dup_childs); - child = strtok_r (childs, ",", &tmp1); - switch_opt->num_child = idx; - switch_opt->array = GF_CALLOC (1, (idx * - sizeof (struct switch_sched_array)), - gf_switch_mt_switch_sched_array); - if (!switch_opt->array) - goto err; - idx = 0; - while (child) { - for (index = 0; index < child_count; index++) { - if (strcmp (switch_buf_array[index].xl->name, - child) == 0) { - gf_log ("switch", GF_LOG_DEBUG, - "'%s' pattern will be " - "scheduled to \"%s\"", - switch_opt->path_pattern, child); - /* - if (switch_buf_array[index-1].considered) { - gf_log ("switch", GF_LOG_DEBUG, - "ambiguity found, exiting"); - return -1; - } - */ - switch_opt->array[idx].xl = switch_buf_array[index].xl; - switch_buf_array[index].considered = 1; - idx++; - break; - } - } - child = strtok_r (NULL, ",", &tmp1); - } + } + child = strtok_r(dup_childs, ",", &tmp); + while (child) { + if (gf_switch_valid_child(this, child)) { + idx++; + child = strtok_r(NULL, ",", &tmp); } else { - /* error */ - gf_log ("switch", GF_LOG_ERROR, - "Check \"scheduler.switch.case\" " - "option in unify volume. Exiting"); - goto err; - } - GF_FREE (dup_str); - - /* Link it to the main structure */ - if (switch_buf) { - /* there are already few entries */ - trav = switch_buf; - while (trav->next) - trav = trav->next; - trav->next = switch_opt; - } else { - /* First entry */ - switch_buf = switch_opt; - } - switch_str = strtok_r (NULL, ";", &tmp_str); - } - - /* Now, all the pattern based considerations done, so for all the - * remaining pattern, '*' to all the remaining child nodes - */ - { - for (index=0; index < child_count; index++) { - /* check for considered flag */ - if (switch_buf_array[index].considered) - continue; - flag++; + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SUBVOL_ERROR, + "%s is not a subvolume of %s. 
" + "pattern can only be scheduled " + "only to a subvolume of %s", + child, this->name, this->name); + GF_FREE(dup_childs); + goto err; } - if (!flag) { - gf_log ("switch", GF_LOG_ERROR, - "No nodes left for pattern '*'. Exiting"); - goto err; - } - switch_opt = GF_CALLOC (1, sizeof (struct switch_struct), - gf_switch_mt_switch_struct); - if (!switch_opt) - goto err; - - /* Add the '*' pattern to the array */ - memcpy (switch_opt->path_pattern, "*", 2); - switch_opt->num_child = flag; - switch_opt->array = - GF_CALLOC (1, - flag * sizeof (struct switch_sched_array), - gf_switch_mt_switch_sched_array); - if (!switch_opt->array) - goto err; - flag = 0; - for (index=0; index < child_count; index++) { - /* check for considered flag */ - if (switch_buf_array[index].considered) - continue; - gf_log ("switch", GF_LOG_DEBUG, - "'%s' pattern will be scheduled to \"%s\"", - switch_opt->path_pattern, - switch_buf_array[index].xl->name); - switch_opt->array[flag].xl = - switch_buf_array[index].xl; + } + GF_FREE(dup_childs); + child = strtok_r(childs, ",", &tmp1); + switch_opt->num_child = idx; + switch_opt->array = GF_CALLOC( + 1, (idx * sizeof(struct switch_sched_array)), + gf_switch_mt_switch_sched_array); + if (!switch_opt->array) + goto err; + idx = 0; + while (child) { + for (index = 0; index < child_count; index++) { + if (strcmp(switch_buf_array[index].xl->name, child) == 0) { + gf_msg_debug("switch", 0, + "'%s' pattern will be " + "scheduled to \"%s\"", + switch_opt->path_pattern, child); + /* + if (switch_buf_array[index-1].considered) { + gf_msg_debug ("switch", 0, + "ambiguity found, exiting"); + return -1; + } + */ + switch_opt->array[idx].xl = switch_buf_array[index].xl; switch_buf_array[index].considered = 1; - flag++; - } - if (switch_buf) { - /* there are already few entries */ - trav = switch_buf; - while (trav->next) - trav = trav->next; - trav->next = switch_opt; - } else { - /* First entry */ - switch_buf = switch_opt; + idx++; + break; + } } + child = 
strtok_r(NULL, ",", &tmp1); + } + } else { + /* error */ + gf_msg("switch", GF_LOG_ERROR, 0, DHT_MSG_SET_SWITCH_PATTERN_ERROR, + "Check \"scheduler.switch.case\" " + "option in unify volume. Exiting"); + goto err; } - /* */ - conf->private = switch_buf; - return 0; -err: + /* Link it to the main structure */ if (switch_buf) { - if (switch_buf_array) - GF_FREE (switch_buf_array); - trav = switch_buf; - while (trav) { - if (trav->array) - GF_FREE (trav->array); - switch_opt = trav; - trav = trav->next; - GF_FREE (switch_opt); - } - } - return -1; -} - - -int -init (xlator_t *this) -{ - dht_conf_t *conf = NULL; - data_t *data = NULL; - char *temp_str = NULL; - int ret = -1; - int i = 0; - uint32_t temp_free_disk = 0; - - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "SWITCH needs more than one subvolume"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile"); - } - - conf = GF_CALLOC (1, sizeof (*conf), gf_switch_mt_dht_conf_t); - if (!conf) { - goto err; - } - - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean */ - if (strcasecmp (temp_str, "auto")) - gf_string2boolean (temp_str, &conf->search_unhashed); - else - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - - conf->unhashed_sticky_bit = 0; - if (dict_get_str (this->options, "unhashed-sticky-bit", - &temp_str) == 0) { - gf_string2boolean (temp_str, &conf->unhashed_sticky_bit); - } - - conf->min_free_disk = 10; - conf->disk_unit = 'p'; - - if (dict_get_str (this->options, "min-free-disk", - &temp_str) == 0) { - if (gf_string2percent (temp_str, - &temp_free_disk) == 0) { - if (temp_free_disk > 100) { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } else { - conf->min_free_disk = (uint64_t)temp_free_disk; - conf->disk_unit = 'p'; - } - } else { - 
gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } + /* there are already few entries */ + trav = switch_buf; + while (trav->next) + trav = trav->next; + trav->next = switch_opt; + } else { + /* First entry */ + switch_buf = switch_opt; + } + switch_opt = NULL; + switch_str = strtok_r(NULL, ";", &tmp_str); + } + + /* Now, all the pattern based considerations done, so for all the + * remaining pattern, '*' to all the remaining child nodes + */ + { + for (index = 0; index < child_count; index++) { + /* check for considered flag */ + if (switch_buf_array[index].considered) + continue; + flag++; + } + if (!flag) { + gf_msg("switch", GF_LOG_ERROR, 0, DHT_MSG_SET_SWITCH_PATTERN_ERROR, + "No nodes left for pattern '*'. Exiting"); + goto err; + } + switch_opt = GF_CALLOC(1, sizeof(struct switch_struct), + gf_switch_mt_switch_struct); + if (!switch_opt) + goto err; + + /* Add the '*' pattern to the array */ + memcpy(switch_opt->path_pattern, "*", 2); + switch_opt->num_child = flag; + switch_opt->array = GF_CALLOC(1, + flag * sizeof(struct switch_sched_array), + gf_switch_mt_switch_sched_array); + if (!switch_opt->array) + goto err; + flag = 0; + for (index = 0; index < child_count; index++) { + /* check for considered flag */ + if (switch_buf_array[index].considered) + continue; + gf_msg_debug("switch", 0, + "'%s'" + " pattern will be scheduled to \"%s\"", + switch_opt->path_pattern, + switch_buf_array[index].xl->name); + + switch_opt->array[flag].xl = switch_buf_array[index].xl; + switch_buf_array[index].considered = 1; + flag++; } - - data = dict_get (this->options, "pattern.switch.case"); - if (data) { - /* TODO: */ - ret = set_switch_pattern (this, conf, data->data); - if (ret) { - goto err; - } + if (switch_buf) { + /* there are already few entries */ + trav = switch_buf; + while (trav->next) + trav = trav->next; + trav->next = switch_opt; + } else { + /* First entry */ + switch_buf = switch_opt; } + switch_opt = NULL; + } + /* */ + 
conf->private = switch_buf; - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; - } + GF_FREE(option_string); + return 0; +err: + GF_FREE(switch_buf_array); + GF_FREE(switch_opt); + GF_FREE(option_string); - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; + if (switch_buf) { + trav = switch_buf; + while (trav) { + GF_FREE(trav->array); + switch_opt = trav; + trav = trav->next; + GF_FREE(switch_opt); } + } + return -1; +} - LOCK_INIT (&conf->subvolume_lock); - LOCK_INIT (&conf->layout_lock); +int32_t +switch_init(xlator_t *this) +{ + dht_conf_t *conf = NULL; + data_t *data = NULL; + int ret = -1; - conf->gen = 1; + ret = dht_init(this); + if (ret) { + return ret; + } + conf = this->private; - conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), - gf_switch_mt_dht_du_t); - if (!conf->du_stats) { - goto err; + data = dict_get(this->options, "pattern.switch.case"); + if (data) { + /* TODO: */ + ret = set_switch_pattern(this, conf, data->data); + if (ret) { + goto err; } + } - this->private = conf; - - return 0; + this->private = conf; + return 0; err: - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->default_dir_layout) - GF_FREE (conf->default_dir_layout); - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - if (conf->du_stats) - GF_FREE (conf->du_stats); - - GF_FREE (conf); - } - - return -1; + dht_fini(this); + return -1; } - struct xlator_fops fops = { - .lookup = switch_lookup, - .create = switch_create, - .mknod = switch_mknod, - - .stat = dht_stat, - .fstat = dht_fstat, - .truncate = dht_truncate, - .ftruncate = dht_ftruncate, - .access = dht_access, - .readlink = dht_readlink, - .setxattr = dht_setxattr, - .getxattr = dht_getxattr, - .removexattr = dht_removexattr, - .open = dht_open, - .readv = dht_readv, - 
.writev = dht_writev, - .flush = dht_flush, - .fsync = dht_fsync, - .statfs = dht_statfs, - .lk = dht_lk, - .opendir = dht_opendir, - .readdir = dht_readdir, - .readdirp = dht_readdirp, - .fsyncdir = dht_fsyncdir, - .symlink = dht_symlink, - .unlink = dht_unlink, - .link = dht_link, - .mkdir = dht_mkdir, - .rmdir = dht_rmdir, - .rename = dht_rename, - .inodelk = dht_inodelk, - .finodelk = dht_finodelk, - .entrylk = dht_entrylk, - .fentrylk = dht_fentrylk, - .xattrop = dht_xattrop, - .fxattrop = dht_fxattrop, - .setattr = dht_setattr, -}; - - -struct xlator_cbks cbks = { - .forget = dht_forget + .lookup = switch_lookup, + .create = switch_create, + .mknod = switch_mknod, + + .stat = dht_stat, + .fstat = dht_fstat, + .truncate = dht_truncate, + .ftruncate = dht_ftruncate, + .access = dht_access, + .readlink = dht_readlink, + .setxattr = dht_setxattr, + .getxattr = dht_getxattr, + .removexattr = dht_removexattr, + .open = dht_open, + .readv = dht_readv, + .writev = dht_writev, + .flush = dht_flush, + .fsync = dht_fsync, + .statfs = dht_statfs, + .lk = dht_lk, + .opendir = dht_opendir, + .readdir = dht_readdir, + .readdirp = dht_readdirp, + .fsyncdir = dht_fsyncdir, + .symlink = dht_symlink, + .unlink = dht_unlink, + .link = dht_link, + .mkdir = dht_mkdir, + .rmdir = dht_rmdir, + .rename = dht_rename, + .inodelk = dht_inodelk, + .finodelk = dht_finodelk, + .entrylk = dht_entrylk, + .fentrylk = dht_fentrylk, + .xattrop = dht_xattrop, + .fxattrop = dht_fxattrop, + .setattr = dht_setattr, }; - -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .value = {"auto", "yes", "no", "enable", "disable", "1", "0", - "on", "off"}, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"pattern.switch.case"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - }, - { .key = {NULL} }, +struct xlator_cbks cbks = {.forget = dht_forget}; +extern int32_t +mem_acct_init(xlator_t *this); + +xlator_api_t xlator_api = { + 
.init = switch_init, + .fini = switch_fini, + .notify = dht_notify, + .reconfigure = dht_reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = dht_options, + .identifier = "switch", + .category = GF_TECH_PREVIEW, }; diff --git a/xlators/cluster/dht/src/unittest/dht_layout_mock.c b/xlators/cluster/dht/src/unittest/dht_layout_mock.c new file mode 100644 index 00000000000..771452963d1 --- /dev/null +++ b/xlators/cluster/dht/src/unittest/dht_layout_mock.c @@ -0,0 +1,73 @@ +/* + Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#include <glusterfs/glusterfs.h> +#include <glusterfs/xlator.h> +#include "dht-common.h" +#include <glusterfs/byte-order.h> + +int +dht_hash_compute(xlator_t *this, int type, const char *name, uint32_t *hash_p) +{ + return 0; +} + +int +dht_inode_ctx_layout_get(inode_t *inode, xlator_t *this, dht_layout_t **layout) +{ + return 0; +} + +int +dht_inode_ctx_layout_set(inode_t *inode, xlator_t *this, + dht_layout_t *layout_int) +{ + return 0; +} + +int +dict_get_ptr(dict_t *this, char *key, void **ptr) +{ + return 0; +} + +int +dict_get_ptr_and_len(dict_t *this, char *key, void **ptr, int *len) +{ + return 0; +} + +int +_gf_log(const char *domain, const char *file, const char *function, + int32_t line, gf_loglevel_t level, const char *fmt, ...) +{ + return 0; +} + +int +_gf_log_callingfn(const char *domain, const char *file, const char *function, + int32_t line, gf_loglevel_t level, const char *fmt, ...) 
+{ + return 0; +} + +void +gf_uuid_unparse(const uuid_t uu, char *out) +{ + // could call a will-return function here + // to place the correct data in *out +} + +int +_gf_msg(const char *domain, const char *file, const char *function, + int32_t line, gf_loglevel_t level, int errnum, int trace, + uint64_t msgid, const char *fmt, ...) +{ + return 0; +} diff --git a/xlators/cluster/dht/src/unittest/dht_layout_unittest.c b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c new file mode 100644 index 00000000000..c94a1d0a2e1 --- /dev/null +++ b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c @@ -0,0 +1,127 @@ +/* + Copyright (c) 2008-2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "dht-common.h" +#include <glusterfs/logging.h> +#include <glusterfs/xlator.h> + +#include <inttypes.h> +#include <stdarg.h> +#include <stddef.h> +#include <setjmp.h> +#include <cmocka_pbc.h> +#include <cmocka.h> + +/* + * Helper functions + */ + +static xlator_t * +helper_xlator_init(uint32_t num_types) +{ + xlator_t *xl; + int i, ret; + + REQUIRE(num_types > 0); + + xl = test_calloc(1, sizeof(xlator_t)); + assert_non_null(xl); + xl->mem_acct->num_types = num_types; + xl->mem_acct = test_calloc(sizeof(struct mem_acct) + + sizeof(struct mem_acct_rec) + num_types); + assert_non_null(xl->mem_acct); + + xl->ctx = test_calloc(1, sizeof(glusterfs_ctx_t)); + assert_non_null(xl->ctx); + + for (i = 0; i < num_types; i++) { + ret = LOCK_INIT(&(xl->mem_acct.rec[i].lock)); + assert_false(ret); + } + + ENSURE(num_types == xl->mem_acct.num_types); + ENSURE(NULL != xl); + + return xl; +} + +static int +helper_xlator_destroy(xlator_t *xl) +{ + int i, ret; + + for (i = 0; i < 
xl->mem_acct.num_types; i++) { + ret = LOCK_DESTROY(&(xl->mem_acct.rec[i].lock)); + assert_int_equal(ret, 0); + } + + free(xl->mem_acct.rec); + free(xl->ctx); + free(xl); + return 0; +} + +/* + * Unit tests + */ +static void +test_dht_layout_new(void **state) +{ + xlator_t *xl; + dht_layout_t *layout; + dht_conf_t *conf; + int cnt; + + expect_assert_failure(dht_layout_new(NULL, 0)); + expect_assert_failure(dht_layout_new((xlator_t *)0x12345, -1)); + xl = helper_xlator_init(10); + + // xl->private is NULL + assert_null(xl->private); + cnt = 100; + layout = dht_layout_new(xl, cnt); + assert_non_null(layout); + assert_int_equal(layout->type, DHT_HASH_TYPE_DM); + assert_int_equal(layout->cnt, cnt); + assert_int_equal(GF_ATOMIC_GET(layout->ref), 1); + assert_int_equal(layout->gen, 0); + assert_int_equal(layout->spread_cnt, 0); + free(layout); + + // xl->private is not NULL + cnt = 110; + conf = (dht_conf_t *)test_calloc(1, sizeof(dht_conf_t)); + assert_non_null(conf); + conf->dir_spread_cnt = 12345; + conf->gen = -123; + xl->private = conf; + + layout = dht_layout_new(xl, cnt); + assert_non_null(layout); + assert_int_equal(layout->type, DHT_HASH_TYPE_DM); + assert_int_equal(layout->cnt, cnt); + assert_int_equal(GF_ATOMIC_GET(layout->ref), 1); + assert_int_equal(layout->gen, conf->gen); + assert_int_equal(layout->spread_cnt, conf->dir_spread_cnt); + free(layout); + + free(conf); + helper_xlator_destroy(xl); +} + +int +main(void) +{ + const struct CMUnitTest xlator_dht_layout_tests[] = { + unit_test(test_dht_layout_new), + }; + + return cmocka_run_group_tests(xlator_dht_layout_tests, NULL, NULL); +} diff --git a/xlators/cluster/ha/Makefile.am b/xlators/cluster/ec/Makefile.am index d471a3f9243..d471a3f9243 100644 --- a/xlators/cluster/ha/Makefile.am +++ b/xlators/cluster/ec/Makefile.am diff --git a/xlators/cluster/ec/src/Makefile.am b/xlators/cluster/ec/src/Makefile.am new file mode 100644 index 00000000000..406a636bbc2 --- /dev/null +++ 
b/xlators/cluster/ec/src/Makefile.am @@ -0,0 +1,83 @@ +xlator_LTLIBRARIES = ec.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +ec_sources := ec.c +ec_sources += ec-data.c +ec_sources += ec-helpers.c +ec_sources += ec-common.c +ec_sources += ec-generic.c +ec_sources += ec-locks.c +ec_sources += ec-dir-read.c +ec_sources += ec-dir-write.c +ec_sources += ec-inode-read.c +ec_sources += ec-inode-write.c +ec_sources += ec-combine.c +ec_sources += ec-method.c +ec_sources += ec-galois.c +ec_sources += ec-code.c +ec_sources += ec-code-c.c +ec_sources += ec-gf8.c +ec_sources += ec-heal.c +ec_sources += ec-heald.c + +ec_headers := ec.h +ec_headers += ec-mem-types.h +ec_headers += ec-helpers.h +ec_headers += ec-data.h +ec_headers += ec-fops.h +ec_headers += ec-common.h +ec_headers += ec-combine.h +ec_headers += ec-method.h +ec_headers += ec-galois.h +ec_headers += ec-code.h +ec_headers += ec-code-c.h +ec_headers += ec-gf8.h +ec_headers += ec-heald.h +ec_headers += ec-messages.h +ec_headers += ec-types.h + +if ENABLE_EC_DYNAMIC_INTEL + ec_sources += ec-code-intel.c + ec_headers += ec-code-intel.h +endif + +if ENABLE_EC_DYNAMIC_X64 + ec_sources += ec-code-x64.c + ec_headers += ec-code-x64.h +endif + +if ENABLE_EC_DYNAMIC_SSE + ec_sources += ec-code-sse.c + ec_headers += ec-code-sse.h +endif + +if ENABLE_EC_DYNAMIC_AVX + ec_sources += ec-code-avx.c + ec_headers += ec-code-avx.h +endif + +ec_ext_sources = $(top_builddir)/xlators/lib/src/libxlator.c + +ec_ext_headers = $(top_builddir)/xlators/lib/src/libxlator.h + +ec_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) +ec_la_SOURCES = $(ec_sources) $(ec_headers) $(ec_ext_sources) $(ec_ext_headers) +ec_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CPPFLAGS = $(GF_CPPFLAGS) +AM_CPPFLAGS += -I$(top_srcdir)/libglusterfs/src +AM_CPPFLAGS += -I$(top_srcdir)/xlators/lib/src +AM_CPPFLAGS += -I$(top_srcdir)/rpc/rpc-lib/src +AM_CPPFLAGS += -I$(top_srcdir)/rpc/xdr/src +AM_CPPFLAGS += 
-I$(top_builddir)/rpc/xdr/src +AM_CPPFLAGS += -DGLUSTERFS_LIBEXECDIR=\"$(GLUSTERFS_LIBEXECDIR)\" + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = + +install-data-hook: + ln -sf ec.so $(DESTDIR)$(xlatordir)/disperse.so + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/disperse.so diff --git a/xlators/cluster/ec/src/ec-code-avx.c b/xlators/cluster/ec/src/ec-code-avx.c new file mode 100644 index 00000000000..70afaa00f54 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-avx.c @@ -0,0 +1,109 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <errno.h> + +#include "ec-code-intel.h" + +static void +ec_code_avx_prolog(ec_code_builder_t *builder) +{ + builder->loop = builder->address; +} + +static void +ec_code_avx_epilog(ec_code_builder_t *builder) +{ + ec_code_intel_op_add_i2r(builder, 32, REG_DX); + ec_code_intel_op_add_i2r(builder, 32, REG_DI); + ec_code_intel_op_test_i2r(builder, builder->width - 1, REG_DX); + ec_code_intel_op_jne(builder, builder->loop); + + ec_code_intel_op_ret(builder, 0); +} + +static void +ec_code_avx_load(ec_code_builder_t *builder, uint32_t dst, uint32_t idx, + uint32_t bit) +{ + if (builder->linear) { + ec_code_intel_op_mov_m2avx( + builder, REG_SI, REG_DX, 1, + idx * builder->width * builder->bits + bit * builder->width, dst); + } else { + if (builder->base != idx) { + ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, + REG_AX); + builder->base = idx; + } + ec_code_intel_op_mov_m2avx(builder, REG_AX, REG_DX, 1, + bit * builder->width, dst); + } +} + +static void +ec_code_avx_store(ec_code_builder_t *builder, uint32_t src, uint32_t bit) +{ + ec_code_intel_op_mov_avx2m(builder, src, REG_DI, REG_NULL, 0, + 
bit * builder->width); +} + +static void +ec_code_avx_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ + ec_code_intel_op_mov_avx2avx(builder, src, dst); +} + +static void +ec_code_avx_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ + ec_code_intel_op_xor_avx2avx(builder, src, dst); +} + +static void +ec_code_avx_xor3(ec_code_builder_t *builder, uint32_t dst, uint32_t src1, + uint32_t src2) +{ + ec_code_intel_op_mov_avx2avx(builder, src1, dst); + ec_code_intel_op_xor_avx2avx(builder, src2, dst); +} + +static void +ec_code_avx_xorm(ec_code_builder_t *builder, uint32_t dst, uint32_t idx, + uint32_t bit) +{ + if (builder->linear) { + ec_code_intel_op_xor_m2avx( + builder, REG_SI, REG_DX, 1, + idx * builder->width * builder->bits + bit * builder->width, dst); + } else { + if (builder->base != idx) { + ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, + REG_AX); + builder->base = idx; + } + ec_code_intel_op_xor_m2avx(builder, REG_AX, REG_DX, 1, + bit * builder->width, dst); + } +} + +static char *ec_code_avx_needed_flags[] = {"avx2", NULL}; + +ec_code_gen_t ec_code_gen_avx = {.name = "avx", + .flags = ec_code_avx_needed_flags, + .width = 32, + .prolog = ec_code_avx_prolog, + .epilog = ec_code_avx_epilog, + .load = ec_code_avx_load, + .store = ec_code_avx_store, + .copy = ec_code_avx_copy, + .xor2 = ec_code_avx_xor2, + .xor3 = ec_code_avx_xor3, + .xorm = ec_code_avx_xorm}; diff --git a/xlators/cluster/ec/src/ec-code-avx.h b/xlators/cluster/ec/src/ec-code-avx.h new file mode 100644 index 00000000000..fdca4ad2c8f --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-avx.h @@ -0,0 +1,18 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_CODE_AVX_H__ +#define __EC_CODE_AVX_H__ + +#include "ec-code.h" + +extern ec_code_gen_t ec_code_gen_avx; + +#endif /* __EC_CODE_AVX_H__ */ diff --git a/xlators/cluster/ec/src/ec-code-c.c b/xlators/cluster/ec/src/ec-code-c.c new file mode 100644 index 00000000000..acdc665c2cf --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-c.c @@ -0,0 +1,11679 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <inttypes.h> +#include <string.h> + +#include "ec-method.h" +#include "ec-code-c.h" + +#define WIDTH (EC_METHOD_WORD_SIZE / sizeof(uint64_t)) + +static void +gf8_muladd_00(void *out, void *in) +{ + memcpy(out, in, EC_METHOD_WORD_SIZE * 8); +} + +static void +gf8_muladd_01(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + out_ptr[0] ^= in_ptr[0]; + out_ptr[WIDTH] ^= in_ptr[WIDTH]; + out_ptr[WIDTH * 2] ^= in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] ^= in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] ^= in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] ^= in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] ^= in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] ^= in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_02(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, 
out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in7; + out1 = in0; + out7 = in6; + out5 = in4; + out6 = in5; + out3 = in2 ^ in7; + out4 = in3 ^ in7; + out2 = in1 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_03(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in0 ^ in7; + tmp0 = in2 ^ in7; + out1 = in0 ^ in1; + out7 = in6 ^ in7; + out5 = in4 ^ in5; + out6 = in5 ^ in6; + out4 = in3 ^ in4 ^ in7; + out2 = tmp0 ^ in1; + out3 = tmp0 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static 
void +gf8_muladd_04(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in6; + out1 = in7; + out7 = in5; + out6 = in4; + tmp0 = in6 ^ in7; + out2 = in0 ^ in6; + out5 = in3 ^ in7; + out3 = tmp0 ^ in1; + out4 = tmp0 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_05: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x05 followed by adding in - confirm. */ +static void +gf8_muladd_05(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in0 ^ in6; + out1 = in1 ^ in7; + out7 = in5 ^ in7; + out6 = in4 ^ in6; + out2 = out0 ^ in2; + out3 = out1 ^ in3 ^ in6; + out5 = out7 ^ in3; + out4 = out6 ^ in2 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = 
out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_06: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x06 followed by adding in - confirm. */ +static void +gf8_muladd_06(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in6 ^ in7; + tmp0 = in1 ^ in6; + out1 = in0 ^ in7; + out7 = in5 ^ in6; + out6 = in4 ^ in5; + out4 = in2 ^ in3 ^ in6; + out5 = in3 ^ in4 ^ in7; + out3 = tmp0 ^ in2; + out2 = tmp0 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_07: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x07 followed by adding in - confirm. */ +static void +gf8_muladd_07(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in6; + tmp1 = in5 ^ in6; + tmp2 = in0 ^ in7; + tmp3 = tmp0 ^ in3; + 
out6 = tmp1 ^ in4; + out7 = tmp1 ^ in7; + out0 = tmp2 ^ in6; + out1 = tmp2 ^ in1; + out3 = tmp3 ^ in1; + out4 = tmp3 ^ in4; + out5 = out4 ^ out7 ^ in2; + out2 = tmp0 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_08: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x08 followed by adding in - confirm. */ +static void +gf8_muladd_08(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in5; + out1 = in6; + out7 = in4; + out6 = in3 ^ in7; + out3 = in0 ^ in5 ^ in6; + out5 = in2 ^ in6 ^ in7; + out2 = in5 ^ in7; + out4 = out2 ^ in1 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_09: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x09 followed by adding in - confirm. */ +static void +gf8_muladd_09(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + 
uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in0 ^ in5; + tmp0 = in3 ^ in6; + out1 = in1 ^ in6; + out7 = in4 ^ in7; + out2 = in2 ^ in5 ^ in7; + out3 = tmp0 ^ out0; + out6 = tmp0 ^ in7; + out4 = out1 ^ out7 ^ in5; + out5 = out2 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_0A: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x0A followed by adding in - confirm. */ +static void +gf8_muladd_0A(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in5 ^ in7; + out1 = in0 ^ in6; + out7 = in4 ^ in6; + out2 = in1 ^ in5; + out6 = out0 ^ in3; + out3 = out0 ^ out1 ^ in2; + out5 = out7 ^ in2 ^ in7; + out4 = out2 ^ in3 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_0B: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x0B followed by adding in - confirm. */ +static void +gf8_muladd_0B(void *out, void *in) +{ + 
unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in5; + tmp1 = in0 ^ in6; + tmp2 = in4 ^ in7; + out0 = in0 ^ in5 ^ in7; + out2 = tmp0 ^ in1; + out1 = tmp1 ^ in1; + out6 = tmp1 ^ out0 ^ in3; + out7 = tmp2 ^ in6; + out4 = tmp2 ^ out6 ^ in1; + out3 = out6 ^ in0 ^ in2; + out5 = tmp0 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_0C: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x0C followed by adding in - confirm. */ +static void +gf8_muladd_0C(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in5 ^ in6; + out1 = in6 ^ in7; + out7 = in4 ^ in5; + tmp0 = in1 ^ in5; + tmp1 = in0 ^ in7; + out5 = in2 ^ in3 ^ in6; + out6 = in3 ^ in4 ^ in7; + out2 = tmp1 ^ out0; + out4 = tmp0 ^ in2; + out3 = tmp0 ^ tmp1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ 
in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_0D: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x0D followed by adding in - confirm. */ +static void +gf8_muladd_0D(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in4 ^ in5; + tmp1 = in5 ^ in6; + out1 = in1 ^ in6 ^ in7; + out7 = tmp0 ^ in7; + out4 = tmp0 ^ in1 ^ in2; + out0 = tmp1 ^ in0; + tmp2 = tmp1 ^ in3; + out6 = tmp2 ^ out7; + out2 = out0 ^ in2 ^ in7; + out3 = out0 ^ out1 ^ in3; + out5 = tmp2 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_0E: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x0E followed by adding in - confirm. */ +static void +gf8_muladd_0E(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = 
out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + tmp1 = in2 ^ in5; + tmp2 = in5 ^ in6; + out1 = in0 ^ in6 ^ in7; + out3 = tmp0 ^ tmp1; + out2 = tmp0 ^ tmp2; + tmp3 = tmp1 ^ in3; + out7 = tmp2 ^ in4; + out0 = tmp2 ^ in7; + out4 = tmp3 ^ in1 ^ in7; + out5 = tmp3 ^ out7; + out6 = out0 ^ out5 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_0F: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x0F followed by adding in - confirm. */ +static void +gf8_muladd_0F(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in6 ^ in7; + tmp1 = tmp0 ^ in1; + tmp2 = tmp0 ^ in5; + out1 = tmp1 ^ in0; + out7 = tmp2 ^ in4; + out0 = tmp2 ^ in0; + out6 = out7 ^ in3; + out5 = out6 ^ in2 ^ in7; + tmp3 = tmp1 ^ out0 ^ in2; + out4 = tmp1 ^ out5; + out2 = tmp3 ^ in6; + out3 = tmp3 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH 
* 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_10: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x10 followed by adding in - confirm. */ +static void +gf8_muladd_10(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in4; + out1 = in5; + out7 = in3 ^ in7; + tmp0 = in6 ^ in7; + out2 = in4 ^ in6; + tmp1 = out2 ^ in5; + out6 = tmp0 ^ in2; + out3 = tmp0 ^ tmp1; + out5 = out2 ^ out3 ^ in1; + out4 = tmp1 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_11: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x11 followed by adding in - confirm. */ +static void +gf8_muladd_11(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out7 = in3; + out0 = in0 ^ in4; + out1 = in1 ^ in5; + out6 = in2 ^ in7; + out4 = in0 ^ in5 ^ in6; + out5 = in1 ^ in6 ^ in7; + out2 = in2 ^ in4 ^ in6; + out3 = in3 ^ in4 ^ in5 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + 
out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_12: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x12 followed by adding in - confirm. */ +static void +gf8_muladd_12(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in4 ^ in7; + out1 = in0 ^ in5; + out3 = in2 ^ in4 ^ in5; + tmp0 = out0 ^ in6; + out2 = tmp0 ^ in1; + tmp1 = tmp0 ^ in3; + out6 = tmp0 ^ out3; + out5 = out2 ^ in5; + out7 = tmp1 ^ in4; + out4 = tmp1 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_13: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x13 followed by adding in - confirm. */ +static void +gf8_muladd_13(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = 
out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out7 = in3 ^ in6; + tmp0 = in0 ^ in5; + tmp1 = in4 ^ in7; + out6 = in2 ^ in5 ^ in7; + out4 = tmp0 ^ out7 ^ in7; + out1 = tmp0 ^ in1; + out0 = tmp1 ^ in0; + out5 = tmp1 ^ in1 ^ in6; + out3 = tmp1 ^ out6 ^ in3; + out2 = out5 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_14: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x14 followed by adding in - confirm. */ +static void +gf8_muladd_14(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in4 ^ in6; + out1 = in5 ^ in7; + out2 = in0 ^ in4; + tmp0 = out0 ^ in5; + out7 = out1 ^ in3; + tmp1 = out1 ^ in2; + out3 = tmp0 ^ in1; + out6 = tmp0 ^ tmp1; + out4 = tmp1 ^ out2; + out5 = out3 ^ in3 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_15: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x15 followed by adding in - confirm. */ +static void +gf8_muladd_15(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = 
(uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out7 = in3 ^ in5; + tmp0 = in0 ^ in4; + out1 = in1 ^ in5 ^ in7; + out5 = in1 ^ in3 ^ in6; + out0 = tmp0 ^ in6; + out2 = tmp0 ^ in2; + out3 = out5 ^ in4 ^ in5; + out6 = out2 ^ in0 ^ in7; + out4 = tmp0 ^ out6 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_16: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x16 followed by adding in - confirm. */ +static void +gf8_muladd_16(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in5; + tmp1 = in4 ^ in7; + tmp2 = in2 ^ in3 ^ in4; + out1 = tmp0 ^ in7; + out4 = tmp0 ^ tmp2; + out0 = tmp1 ^ in6; + tmp3 = tmp1 ^ in1; + out6 = out0 ^ in2 ^ in5; + out2 = tmp3 ^ in0; + out3 = out6 ^ in1; + out7 = tmp2 ^ out6; + out5 = tmp3 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + 
out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_17: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x17 followed by adding in - confirm. */ +static void +gf8_muladd_17(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in5; + tmp1 = in3 ^ in6; + tmp2 = tmp0 ^ in4; + out4 = tmp0 ^ in0 ^ in3; + out7 = tmp1 ^ in5; + tmp3 = tmp1 ^ in1; + out6 = tmp2 ^ in7; + out5 = tmp3 ^ in4; + out3 = tmp3 ^ out6; + out0 = out3 ^ out4 ^ in1; + out2 = out3 ^ out7 ^ in0; + out1 = tmp2 ^ out2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_18: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x18 followed by adding in - confirm. */ +static void +gf8_muladd_18(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + 
uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in4 ^ in5; + out1 = in5 ^ in6; + tmp0 = in4 ^ in7; + out5 = in1 ^ in2 ^ in5; + out6 = in2 ^ in3 ^ in6; + out2 = tmp0 ^ out1; + out7 = tmp0 ^ in3; + tmp1 = tmp0 ^ in0; + out3 = tmp1 ^ in6; + out4 = tmp1 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_19: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x19 followed by adding in - confirm. */ +static void +gf8_muladd_19(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out5 = in1 ^ in2; + out7 = in3 ^ in4; + tmp0 = in0 ^ in7; + out6 = in2 ^ in3; + out1 = in1 ^ in5 ^ in6; + out0 = in0 ^ in4 ^ in5; + out4 = tmp0 ^ in1; + tmp1 = tmp0 ^ in6; + out2 = tmp1 ^ out0 ^ in2; + out3 = tmp1 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_1A: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x1A followed by adding in - confirm. */ +static void +gf8_muladd_1A(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t 
*out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in4 ^ in5; + tmp1 = in5 ^ in6; + tmp2 = tmp0 ^ in1; + out0 = tmp0 ^ in7; + out1 = tmp1 ^ in0; + tmp3 = tmp1 ^ in3; + out5 = tmp2 ^ in2; + out2 = tmp2 ^ in6; + out7 = tmp3 ^ out0; + out6 = tmp3 ^ in2; + out4 = tmp3 ^ out2 ^ in0; + out3 = tmp0 ^ out1 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_1B: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x1B followed by adding in - confirm. */ +static void +gf8_muladd_1B(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in4; + tmp1 = in2 ^ in5; + tmp2 = in3 ^ in6; + out5 = tmp0 ^ in1; + tmp3 = tmp0 ^ in0; + out6 = tmp1 ^ in3; + out0 = tmp1 ^ tmp3 ^ in7; + out7 = tmp2 ^ in4; + tmp4 = out5 ^ in6; + out3 = tmp2 ^ tmp3; + out2 = tmp4 ^ in5; + out4 = tmp4 ^ out3; + out1 = tmp3 ^ out2; + + out_ptr[0] = out0 ^ in_ptr[0]; + 
out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_1C: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x1C followed by adding in - confirm. */ +static void +gf8_muladd_1C(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in3; + tmp1 = in4 ^ in6; + tmp2 = in5 ^ in7; + out6 = tmp0 ^ tmp1; + out0 = tmp1 ^ in5; + out1 = tmp2 ^ in6; + tmp3 = tmp2 ^ in1; + tmp4 = tmp2 ^ in4; + out2 = tmp4 ^ in0; + out7 = tmp4 ^ in3; + out5 = tmp0 ^ tmp3; + out3 = tmp3 ^ out2; + out4 = out3 ^ in2 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_1D: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x1D followed by adding in - confirm. */ +static void +gf8_muladd_1D(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 
2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in3; + tmp1 = in0 ^ in4; + tmp2 = in3 ^ in4; + tmp3 = in2 ^ in7; + out3 = tmp0 ^ tmp1; + out5 = tmp0 ^ tmp3; + tmp4 = tmp1 ^ in5; + out6 = tmp2 ^ in2; + out7 = tmp2 ^ in5; + out2 = tmp3 ^ tmp4; + out4 = out3 ^ out6 ^ in6; + out0 = tmp4 ^ in6; + out1 = out2 ^ out4 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_1E: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x1E followed by adding in - confirm. */ +static void +gf8_muladd_1E(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in4; + tmp1 = in2 ^ in7; + tmp2 = tmp0 ^ in1; + out3 = tmp1 ^ tmp2; + out2 = tmp2 ^ in5; + out4 = out3 ^ in3 ^ in6; + tmp3 = out4 ^ in7; + out6 = tmp3 ^ out2 ^ in4; + out7 = tmp1 ^ out6; + out0 = out7 ^ in3; + out1 = tmp0 ^ out0; + out5 = tmp3 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH 
* 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_1F: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x1F followed by adding in - confirm. */ +static void +gf8_muladd_1F(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in4 ^ in6; + tmp1 = tmp0 ^ in5; + out7 = tmp1 ^ in3; + out0 = tmp1 ^ in0 ^ in7; + out6 = out7 ^ in2 ^ in6; + out1 = out0 ^ in1 ^ in4; + out4 = out0 ^ out6 ^ in1; + out3 = tmp0 ^ out4; + out2 = out4 ^ out7 ^ in7; + out5 = out3 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_20: for each i in [0, WIDTH), replaces the eight words out[k * WIDTH + i] (k = 0..7) with XOR combinations of their old values, each then XORed with in[k * WIDTH + i]; in is only read. NOTE(review): presumably a bit-sliced GF(2^8) multiply of out by 0x20 followed by adding in - confirm. */ +static void +gf8_muladd_20(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in4; + out0 = in3 ^ in7; + tmp0 = in3 ^ in4; + tmp1 = in6 ^ in7; + out2 = out0 ^ in5; + out4 = tmp0 ^ in5; + out3 = tmp0 ^ 
tmp1; + out7 = tmp1 ^ in2; + out6 = tmp1 ^ in1 ^ in5; + out5 = out2 ^ out3 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_21(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in1 ^ in4; + tmp0 = in4 ^ in6; + out4 = in3 ^ in5; + out7 = in2 ^ in6; + out0 = in0 ^ in3 ^ in7; + out6 = in1 ^ in5 ^ in7; + out3 = tmp0 ^ in7; + out5 = tmp0 ^ in0; + out2 = out4 ^ in2 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_22(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = 
out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in3; + out1 = in0 ^ in4; + out7 = in2 ^ in7; + out4 = in4 ^ in5 ^ in7; + out5 = in0 ^ in5 ^ in6; + out6 = in1 ^ in6 ^ in7; + out3 = in2 ^ in3 ^ in4 ^ in6; + out2 = in1 ^ in3 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_23(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out7 = in2; + out0 = in0 ^ in3; + out4 = in5 ^ in7; + out5 = in0 ^ in6; + out6 = in1 ^ in7; + out3 = in2 ^ in4 ^ in6; + out1 = in0 ^ in1 ^ in4; + out2 = out4 ^ out6 ^ in2 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_24(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t 
*out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in4 ^ in7; + tmp0 = in3 ^ in4; + out0 = in3 ^ in6 ^ in7; + out3 = tmp0 ^ in1; + tmp1 = out0 ^ in5; + out6 = tmp1 ^ out3; + out2 = tmp1 ^ in0; + out7 = tmp1 ^ in2 ^ in3; + out5 = out2 ^ in4; + out4 = tmp0 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_25(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in1 ^ in4; + tmp0 = in2 ^ in5; + out1 = out3 ^ in7; + out7 = tmp0 ^ in6; + out6 = out1 ^ in5; + out4 = out7 ^ in3 ^ in7; + out2 = out4 ^ in0; + out0 = tmp0 ^ out2; + out5 = out0 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 
^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_26(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in3 ^ in6; + tmp0 = in4 ^ in7; + out7 = in2 ^ in5 ^ in7; + tmp1 = out0 ^ in0 ^ in5; + out1 = tmp0 ^ in0; + tmp2 = tmp0 ^ in6; + out2 = tmp1 ^ in1; + out5 = tmp1 ^ in7; + out6 = tmp2 ^ in1; + out4 = tmp2 ^ out7; + out3 = out0 ^ out6 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_27(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out7 = in2 ^ in5; + out0 = in0 ^ in3 ^ in6; + out6 = in1 
^ in4 ^ in7; + out4 = out7 ^ in6; + out2 = out0 ^ out7 ^ in1; + out5 = out0 ^ in7; + out1 = out6 ^ in0; + out3 = out6 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_28(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in3; + out1 = in4 ^ in6; + out0 = in3 ^ in5 ^ in7; + tmp0 = out1 ^ in7; + tmp1 = out0 ^ in4; + out7 = tmp0 ^ in2; + tmp2 = tmp0 ^ in1; + out3 = tmp1 ^ in0; + out6 = tmp1 ^ tmp2; + out4 = tmp2 ^ in3; + out5 = out3 ^ in2 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_29(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + 
uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in2 ^ in3; + tmp0 = in1 ^ in3; + tmp1 = in4 ^ in6; + tmp2 = in0 ^ in4 ^ in7; + out6 = tmp0 ^ in5; + out4 = tmp0 ^ in6 ^ in7; + out1 = tmp1 ^ in1; + out7 = tmp1 ^ in2; + out3 = tmp2 ^ in5; + out5 = tmp2 ^ in2; + out0 = out3 ^ in3 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_2A(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in3 ^ in5; + tmp0 = in1 ^ in3; + tmp1 = in0 ^ in4; + out7 = in2 ^ in4 ^ in7; + out3 = tmp1 ^ out0 ^ in2; + out2 = tmp0 ^ in7; + out6 = tmp0 ^ in6; + out1 = tmp1 ^ in6; + out5 = tmp1 ^ out7 ^ in5; + out4 = out1 ^ in0 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + 
out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_2B(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in1 ^ in6; + out7 = in2 ^ in4; + tmp0 = in0 ^ in5; + tmp1 = in2 ^ in7; + out6 = in1 ^ in3; + out1 = out4 ^ in0 ^ in4; + out3 = tmp0 ^ out7; + out0 = tmp0 ^ in3; + out5 = tmp1 ^ in0; + out2 = tmp1 ^ out6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_2C(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in5; + tmp1 = in2 ^ in3 ^ in4; + tmp2 = tmp0 ^ in6; + out4 = tmp1 ^ in1; + out5 = tmp1 ^ in0 ^ in5; + tmp3 = tmp2 ^ in4; 
+ out6 = tmp2 ^ out4; + out7 = tmp3 ^ in7; + out2 = tmp3 ^ out5; + out3 = out6 ^ in0; + out0 = tmp1 ^ out7; + out1 = tmp0 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_2D(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in3; + out4 = tmp0 ^ in1; + tmp1 = tmp0 ^ in0; + out2 = tmp1 ^ in6; + out5 = tmp1 ^ in4; + tmp2 = out2 ^ in2; + tmp3 = tmp2 ^ in5; + out0 = tmp3 ^ in7; + out7 = tmp3 ^ out5; + out6 = out4 ^ out7 ^ in6; + out3 = tmp2 ^ out6; + out1 = out0 ^ out6 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_2E(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, 
out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in4 ^ in7; + out0 = in3 ^ in5 ^ in6; + tmp1 = tmp0 ^ in0; + tmp2 = tmp0 ^ in2; + out1 = tmp1 ^ in6; + out4 = tmp2 ^ in1; + out7 = tmp2 ^ in5; + out3 = out0 ^ out4 ^ in0; + out2 = out3 ^ out7 ^ in7; + out6 = tmp1 ^ out2; + out5 = tmp1 ^ out7 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_2F(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in3; + tmp1 = in2 ^ in5; + out4 = in1 ^ in2 ^ in7; + out6 = in1 ^ in3 ^ in4; + out5 = tmp0 ^ in2; + tmp2 = tmp0 ^ in6; + out7 = tmp1 ^ in4; + out0 = tmp2 ^ in5; + out2 = tmp2 ^ out4; + out1 = tmp2 ^ out6 ^ in7; + out3 = tmp1 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ 
in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_30(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in4 ^ in5; + tmp0 = in3 ^ in6; + tmp1 = in4 ^ in7; + out6 = in1 ^ in2 ^ in5; + out3 = tmp0 ^ in5; + out4 = tmp0 ^ in0; + out7 = tmp0 ^ in2; + out0 = tmp1 ^ in3; + out2 = tmp1 ^ out3; + out5 = tmp1 ^ in0 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_31(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in5 ^ in6; + tmp0 = in4 ^ in5; + tmp1 = in0 ^ in3 ^ in4; 
+ tmp2 = out3 ^ in2; + out1 = tmp0 ^ in1; + out0 = tmp1 ^ in7; + out4 = tmp1 ^ in6; + out6 = tmp2 ^ in1; + out2 = tmp2 ^ out0 ^ in0; + out5 = out1 ^ in0 ^ in7; + out7 = tmp0 ^ out2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_32(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in3 ^ in4; + out7 = in2 ^ in3; + tmp0 = in5 ^ in6; + tmp1 = in0 ^ in7; + out6 = in1 ^ in2; + out1 = in0 ^ in4 ^ in5; + out2 = tmp0 ^ out0 ^ in1; + out3 = tmp0 ^ out7 ^ in7; + out4 = tmp1 ^ in6; + out5 = tmp1 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_33(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, 
out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in3; + tmp1 = in0 ^ in4; + tmp2 = in1 ^ in5; + out6 = in1 ^ in2 ^ in6; + out7 = tmp0 ^ in7; + out0 = tmp1 ^ in3; + out1 = tmp1 ^ tmp2; + tmp3 = tmp2 ^ in7; + tmp4 = tmp2 ^ in4 ^ in6; + out5 = tmp3 ^ in0; + out3 = tmp3 ^ out6; + out4 = tmp4 ^ out5; + out2 = tmp0 ^ tmp4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_34(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in4; + tmp1 = in4 ^ in5; + tmp2 = tmp0 ^ in1; + tmp3 = tmp0 ^ in6; + out1 = tmp1 ^ in7; + tmp4 = tmp1 ^ in2; + out5 = tmp2 ^ in0; + out3 = tmp2 ^ out1; + out0 = tmp3 ^ in7; + out7 = tmp3 ^ tmp4; + out6 = tmp4 ^ in1; + out2 = out3 ^ out5 ^ in3; + out4 = tmp4 ^ out2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH 
* 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_35(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in6; + tmp1 = in5 ^ in7; + out7 = tmp0 ^ tmp1 ^ in3; + out3 = tmp1 ^ in1; + out1 = out3 ^ in4; + tmp2 = out1 ^ in7; + out5 = tmp2 ^ in0 ^ in3; + out6 = tmp0 ^ tmp2; + out0 = out3 ^ out5 ^ in6; + out4 = tmp0 ^ out0; + out2 = out4 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_36(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = 
out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in0 ^ in2; + tmp0 = in1 ^ in3; + out0 = in3 ^ in4 ^ in6; + out6 = in1 ^ in2 ^ in4; + out5 = tmp0 ^ in0; + tmp1 = out5 ^ in5; + out2 = tmp1 ^ in4; + out3 = tmp1 ^ out4; + out1 = tmp0 ^ out2 ^ in7; + out7 = out3 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_37(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in2; + tmp1 = in2 ^ in4; + tmp2 = tmp0 ^ in6; + out3 = tmp0 ^ in5; + out4 = tmp1 ^ in0; + out6 = tmp2 ^ in4; + out1 = out3 ^ out4 ^ in7; + tmp3 = out4 ^ in1 ^ in3; + out7 = tmp3 ^ out1; + out2 = tmp3 ^ in5; + out5 = tmp1 ^ out2; + out0 = tmp2 ^ tmp3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_38(void *out, void *in) +{ + unsigned int i; + 
uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in0 ^ in3; + tmp0 = in3 ^ in4; + tmp1 = in5 ^ in7; + tmp2 = out3 ^ in1; + out2 = tmp0 ^ in6; + out0 = tmp0 ^ tmp1; + out4 = tmp1 ^ tmp2; + out7 = out2 ^ in2; + out1 = out2 ^ in3 ^ in5; + out6 = out4 ^ in0 ^ in2; + out5 = tmp2 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_39(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in0; + tmp0 = in1 ^ in5; + tmp1 = tmp0 ^ in4; + out1 = tmp1 ^ in6; + out5 = out1 ^ in0 ^ in2; + tmp2 = tmp0 ^ out5; + out2 = tmp2 ^ in0 ^ in3; + out7 = out2 ^ in7; + out6 = tmp1 ^ out7; + out4 = tmp2 ^ out6; + out0 = out4 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ 
in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_3A(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + tmp1 = in0 ^ in2; + tmp2 = in3 ^ in4; + tmp3 = in1 ^ in6; + tmp4 = in3 ^ in7; + out4 = tmp0 ^ in5; + out5 = tmp1 ^ tmp3; + out3 = tmp1 ^ tmp4; + out0 = tmp2 ^ in5; + out7 = tmp2 ^ in2; + tmp5 = tmp3 ^ in4; + out2 = tmp4 ^ tmp5; + out1 = tmp5 ^ out4; + out6 = tmp0 ^ out3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_3B(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t 
in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in6; + tmp1 = in2 ^ in7; + tmp2 = tmp0 ^ in3; + out3 = tmp1 ^ in0; + out6 = tmp1 ^ tmp2; + out2 = out6 ^ in4; + out7 = tmp0 ^ out2; + out0 = out3 ^ out7 ^ in5; + out5 = out0 ^ out2 ^ in7; + out1 = tmp2 ^ out0; + out4 = out1 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_3C(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in3; + tmp1 = in2 ^ in7; + tmp2 = in1 ^ in6 ^ in7; + out2 = tmp0 ^ in4; + out3 = tmp0 ^ tmp2; + out4 = tmp1 ^ out3 ^ in5; + out5 = tmp2 ^ out2 ^ in2; + out1 = out4 ^ out5 ^ in6; + out0 = out1 ^ in3; + out7 = tmp1 ^ out0; + out6 = tmp2 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ 
in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_3D(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in2; + tmp1 = tmp0 ^ in3; + out2 = tmp1 ^ in4; + tmp2 = out2 ^ in5; + out4 = tmp2 ^ in1 ^ in6; + out5 = out4 ^ in7; + out6 = out5 ^ in0; + out7 = out6 ^ in1; + out0 = tmp0 ^ out7; + out1 = tmp1 ^ out5; + out3 = tmp2 ^ out6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_3E(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in5; + tmp1 = tmp0 ^ in4; + out0 = tmp1 ^ in6; + out7 = tmp1 ^ in2; + out6 = out7 ^ in1 ^ in5 ^ in7; + out2 = out6 ^ in0 ^ in2; + out4 = out0 ^ out6 ^ in0; + out5 = tmp0 ^ out4; 
+ out3 = out5 ^ in7; + out1 = out3 ^ out6 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_3F(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + out3 = tmp0 ^ in2 ^ in6; + tmp1 = out3 ^ in5 ^ in7; + out4 = tmp1 ^ in4; + out5 = tmp1 ^ in3; + out1 = out4 ^ in2; + out7 = out1 ^ out3 ^ in3; + out2 = tmp0 ^ out7 ^ in5; + tmp2 = out2 ^ in0; + out6 = tmp2 ^ in6; + out0 = tmp1 ^ tmp2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_40(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; 
+ uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in3 ^ in7; + tmp0 = in3 ^ in4; + tmp1 = in6 ^ in7; + out4 = tmp0 ^ in2; + out5 = tmp0 ^ in5; + out0 = tmp1 ^ in2; + out7 = tmp1 ^ in1 ^ in5; + out2 = out0 ^ in4; + out3 = out2 ^ out5 ^ in7; + out6 = out3 ^ out4 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_41(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in2 ^ in3; + tmp0 = in5 ^ in6; + tmp1 = in6 ^ in7; + out5 = in3 ^ in4; + out1 = in1 ^ in3 ^ in7; + out6 = in0 ^ in4 ^ in5; + out3 = tmp0 ^ in2; + out7 = tmp0 ^ in1; + out2 = tmp1 ^ in4; + out0 = tmp1 ^ in0 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; 
+ + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_42(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in2 ^ in6; + out5 = in3 ^ in5; + out1 = in0 ^ in3 ^ in7; + out7 = in1 ^ in5 ^ in7; + out4 = in2 ^ in4 ^ in7; + out6 = in0 ^ in4 ^ in6; + out2 = out0 ^ in1 ^ in4; + out3 = out5 ^ in6 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_43(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out5 = in3; + out7 = in1 ^ in5; + out4 = in2 ^ in7; + out6 = in0 ^ in4; + out0 = in0 ^ in2 ^ in6; + out3 = in5 ^ in6 ^ in7; + out2 = in1 ^ in4 ^ in6; + out1 = in0 ^ in1 ^ in3 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 
2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_44(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in3; + out0 = in2 ^ in7; + tmp0 = in4 ^ in7; + out7 = in1 ^ in6 ^ in7; + out6 = in0 ^ in5 ^ in6; + out4 = tmp0 ^ in3 ^ in6; + out3 = out0 ^ in1 ^ in3 ^ in5; + out2 = out0 ^ in0 ^ in4; + out5 = tmp0 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_45(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in1 ^ in3; + 
out7 = in1 ^ in6; + out5 = in4 ^ in7; + out6 = in0 ^ in5; + out0 = in0 ^ in2 ^ in7; + out4 = in3 ^ in6 ^ in7; + out2 = out5 ^ in0; + out3 = out0 ^ out6 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_46(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in2; + out1 = in0 ^ in3; + out7 = in1 ^ in7; + out4 = in4 ^ in6; + out5 = in5 ^ in7; + out6 = in0 ^ in6; + out3 = in1 ^ in3 ^ in5; + out2 = out4 ^ out6 ^ in1 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_47(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + 
uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in6; + out7 = in1; + out5 = in7; + out6 = in0; + tmp0 = in0 ^ in1; + out3 = in1 ^ in5; + out0 = in0 ^ in2; + out1 = tmp0 ^ in3; + out2 = tmp0 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_48(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in3; + out1 = in3 ^ in6 ^ in7; + out3 = tmp0 ^ in0; + out0 = tmp0 ^ out1 ^ in5; + tmp1 = out0 ^ in4; + out2 = tmp1 ^ in7; + out5 = tmp1 ^ in3; + out4 = out5 ^ in1; + out7 = tmp0 ^ out4; + out6 = tmp1 ^ out3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_49(void 
*out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in0 ^ in2; + tmp0 = in2 ^ in5; + out2 = in4 ^ in5 ^ in6; + tmp1 = tmp0 ^ out2 ^ in3; + out7 = out2 ^ in1; + out5 = tmp1 ^ in7; + out4 = out5 ^ out7 ^ in6; + out1 = tmp0 ^ out4; + out6 = out1 ^ out7 ^ in0; + out0 = tmp1 ^ out6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_4A(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in6; + tmp1 = in3 ^ in7; + out0 = tmp0 ^ in5; + out3 = tmp1 ^ in0; + out5 = tmp1 ^ out0; + out4 = out0 ^ in1 ^ in4; + out1 = out3 ^ in6; + out2 = out4 ^ in7; + out6 = out1 ^ in4; + out7 = tmp0 ^ out2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ 
in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_4B(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in0 ^ in7; + tmp0 = in1 ^ in5; + tmp1 = in2 ^ in6; + tmp2 = out3 ^ in3; + out7 = tmp0 ^ in4; + out4 = tmp0 ^ tmp1; + tmp3 = tmp1 ^ in0; + out6 = tmp2 ^ in4; + out5 = tmp2 ^ tmp3; + out1 = tmp2 ^ in1 ^ in6; + out2 = out7 ^ in6 ^ in7; + out0 = tmp3 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_4C(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = 
out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in3 ^ in6; + tmp0 = in2 ^ in5; + tmp1 = out1 ^ in5 ^ in7; + out0 = tmp0 ^ in7; + tmp2 = tmp0 ^ in4; + out6 = tmp1 ^ in0; + out2 = tmp2 ^ in0; + out5 = tmp2 ^ in6; + out3 = tmp0 ^ out6 ^ in1; + out7 = out0 ^ out5 ^ in1; + out4 = tmp1 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_4D(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in5; + tmp1 = in1 ^ in6; + out4 = in1 ^ in3 ^ in5; + tmp2 = tmp0 ^ in7; + out2 = tmp0 ^ in4; + out1 = tmp1 ^ in3; + out7 = tmp1 ^ in4; + out0 = tmp2 ^ in2; + out6 = tmp2 ^ in3; + out5 = out7 ^ in1 ^ in2; + out3 = tmp1 ^ out0 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + 
+static void +gf8_muladd_4E(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in2 ^ in5; + out7 = in1 ^ in4 ^ in7; + out1 = in0 ^ in3 ^ in6; + out5 = out0 ^ in6; + out4 = out7 ^ in5; + out3 = out1 ^ in1; + out6 = out1 ^ in7; + out2 = out4 ^ in0 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_4F(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out5 = in2 ^ in6; + out7 = in1 ^ in4; + out3 = in0 ^ in1 ^ in6; + out4 = in1 ^ in5 ^ in7; + out0 = in0 ^ in2 ^ in5; + out6 = in0 ^ in3 ^ in7; + out1 = out3 ^ in3; + out2 = out4 ^ in0 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ 
in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_50(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in2 ^ in7; + tmp0 = in3 ^ in5; + out0 = out2 ^ in4 ^ in6; + out1 = tmp0 ^ in7; + tmp1 = tmp0 ^ in6; + out3 = out0 ^ in3; + out7 = tmp1 ^ in1; + tmp2 = tmp1 ^ in0; + out5 = out3 ^ in1 ^ in2; + out4 = tmp2 ^ in2; + out6 = tmp2 ^ out3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_51(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in7; + 
out3 = in2 ^ in4 ^ in6 ^ in7; + out0 = out3 ^ in0; + out6 = out0 ^ in5; + out4 = out6 ^ in3 ^ in7; + out1 = out0 ^ out4 ^ in1; + out7 = out1 ^ in6; + out5 = out7 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_52(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in1 ^ in2; + tmp0 = in2 ^ in4; + tmp1 = in3 ^ in5; + tmp2 = in3 ^ in6; + tmp3 = in0 ^ in7; + out0 = tmp0 ^ in6; + out6 = tmp0 ^ tmp3; + out7 = tmp1 ^ in1; + out1 = tmp1 ^ tmp3; + out3 = tmp2 ^ in4; + out5 = tmp2 ^ in1 ^ in7; + out4 = tmp2 ^ out1 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_53(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t 
out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in1; + out3 = in4 ^ in6; + out0 = out3 ^ in0 ^ in2; + out6 = out0 ^ in7; + out4 = out6 ^ in5; + out7 = out0 ^ out4 ^ in1 ^ in3; + out1 = out7 ^ in0; + out5 = out7 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_54(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in3 ^ in5; + tmp0 = in1 ^ in3; + tmp1 = in2 ^ in4; + tmp2 = in0 ^ in7; + out5 = in1 ^ in4 ^ in6; + out4 = tmp2 ^ out1; + out7 = tmp0 ^ in6; + out3 = tmp0 ^ tmp1; + out0 = tmp1 ^ in7; + tmp3 = tmp2 ^ in2; + out2 = tmp3 ^ in6; + out6 = tmp3 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ 
in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_55(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in3; + tmp1 = in1 ^ in4; + tmp2 = in6 ^ in7; + out7 = tmp0 ^ tmp2; + out1 = tmp0 ^ in5; + out3 = tmp1 ^ in2; + out5 = tmp1 ^ in5 ^ in6; + out2 = tmp2 ^ in0; + out4 = out5 ^ out7 ^ in0; + out6 = out2 ^ in2 ^ in5; + out0 = out5 ^ out6 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_56(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in2 ^ in4; + tmp0 = in0 ^ in2; + out4 = in0 ^ in5; + out7 = in1 ^ in3; + 
out5 = in1 ^ in6; + out6 = tmp0 ^ in7; + out2 = tmp0 ^ out5; + out1 = out4 ^ in3; + out3 = out7 ^ in4 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_57(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in5; + tmp1 = in1 ^ in7; + out0 = in0 ^ in2 ^ in4; + out5 = in1 ^ in5 ^ in6; + out4 = tmp0 ^ in4; + out1 = tmp0 ^ in1 ^ in3; + out2 = tmp0 ^ out5; + out3 = tmp1 ^ in4; + out7 = tmp1 ^ in3; + out6 = tmp1 ^ out2 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_58(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + 
uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in2 ^ in5; + tmp0 = in2 ^ in3 ^ in4; + out5 = tmp0 ^ in1; + out6 = tmp0 ^ in0 ^ in5; + out3 = out6 ^ in7; + tmp1 = out2 ^ out5; + out7 = tmp1 ^ in6; + out4 = tmp1 ^ out3 ^ in3; + out0 = out4 ^ out7 ^ in0; + out1 = tmp0 ^ out0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_59(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in5; + tmp0 = in0 ^ in5 ^ in7; + out3 = tmp0 ^ in2 ^ in4; + out0 = out3 ^ in6; + tmp1 = out0 ^ in7; + out6 = tmp1 ^ in3; + out5 = out6 ^ in0 ^ in1 ^ in6; + out4 = tmp0 ^ out5; + out1 = tmp1 ^ out4; + out7 = out1 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + 
out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_5A(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in2; + tmp1 = in2 ^ in5; + out5 = tmp0 ^ in3; + out4 = tmp0 ^ in0; + tmp2 = tmp1 ^ in4; + out2 = tmp1 ^ in1 ^ in7; + out7 = tmp2 ^ out5; + out6 = out4 ^ out7 ^ in5; + out0 = tmp2 ^ in6; + out1 = out0 ^ out6 ^ in7; + out3 = tmp1 ^ out6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_5B(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in3; + tmp1 = in0 ^ in4; + tmp2 = in1 ^ in5; + out5 = tmp0 ^ tmp2; + tmp3 = tmp1 ^ in6; + out3 = tmp1 ^ in5; + out2 = 
tmp2 ^ in7; + tmp4 = out3 ^ in2; + out7 = out2 ^ in3 ^ in4; + out0 = tmp4 ^ in6; + out6 = tmp0 ^ tmp3; + out4 = tmp2 ^ tmp4; + out1 = tmp3 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_5C(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in6; + tmp1 = in0 ^ in2 ^ in5; + out1 = tmp0 ^ in5; + tmp2 = tmp0 ^ in1; + out2 = tmp1 ^ in6; + out6 = tmp1 ^ in3; + out4 = tmp2 ^ in0; + out7 = tmp2 ^ in4; + out3 = tmp1 ^ out7; + out0 = out3 ^ out4 ^ in7; + out5 = out0 ^ in1 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_5D(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, 
out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + tmp1 = in0 ^ in6; + out2 = tmp1 ^ in5; + tmp2 = out2 ^ in3; + out6 = tmp2 ^ in2; + out1 = tmp0 ^ tmp2; + tmp3 = out1 ^ in4 ^ in5; + out4 = tmp3 ^ in0; + out7 = tmp3 ^ in7; + tmp4 = out4 ^ out6; + out5 = tmp4 ^ in7; + out0 = tmp0 ^ out5; + out3 = tmp1 ^ tmp4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_5E(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in5; + tmp1 = in3 ^ in5; + tmp2 = in1 ^ in7; + out7 = in1 ^ in3 ^ in4; + out0 = tmp0 ^ in4; + tmp3 = tmp1 ^ in0; + out5 = tmp2 ^ in2; + out1 = tmp3 ^ in6; + out6 = tmp0 ^ tmp3; + tmp4 = tmp2 ^ out1; + out3 = tmp4 ^ in4; + out4 = tmp1 ^ tmp4; + out2 = tmp0 ^ out4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + 
out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_5F(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in5; + tmp1 = in0 ^ in6; + tmp2 = tmp0 ^ in7; + tmp3 = tmp1 ^ in3; + out2 = tmp1 ^ tmp2; + out5 = tmp2 ^ in2; + out6 = tmp3 ^ in2; + out3 = out2 ^ in4; + out4 = out3 ^ in5; + out1 = tmp0 ^ tmp3; + out7 = tmp3 ^ out4; + out0 = out4 ^ out5 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_60(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = 
out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in2 ^ in5; + tmp0 = in3 ^ in6; + out1 = in3 ^ in4 ^ in7; + out7 = out4 ^ in1; + tmp1 = out4 ^ in4; + out0 = tmp0 ^ in2; + out5 = tmp0 ^ in0; + out2 = tmp0 ^ tmp1; + out3 = tmp1 ^ in7; + out6 = out3 ^ out7 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_61(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in5; + out4 = tmp0 ^ in4; + tmp1 = out4 ^ in3; + out3 = tmp1 ^ in7; + out2 = tmp1 ^ in2 ^ in6; + out1 = tmp0 ^ out3 ^ in1; + out0 = out2 ^ out4 ^ in0; + out7 = tmp1 ^ out1; + out6 = out0 ^ out1 ^ in2; + out5 = tmp0 ^ out0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_62(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t 
*out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in4 ^ in5; + tmp0 = in0 ^ in3 ^ in4; + out1 = tmp0 ^ in7; + out5 = tmp0 ^ in6; + tmp1 = out1 ^ in0; + tmp2 = tmp1 ^ out3; + out4 = tmp2 ^ in2; + tmp3 = tmp2 ^ in1; + out0 = out4 ^ in5 ^ in6; + out7 = tmp3 ^ out0; + out6 = tmp0 ^ tmp3; + out2 = tmp1 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_63(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in4; + tmp1 = in1 ^ in7; + out3 = tmp0 ^ in5; + tmp2 = out3 ^ in6; + out4 = out3 ^ in2 ^ in7; + out5 = tmp2 ^ in0; + tmp3 = out5 ^ in3; + out0 = tmp3 ^ out4; + out2 = tmp1 ^ tmp2; + out6 = tmp1 ^ tmp3; + tmp4 = tmp0 ^ out2; + out1 = tmp4 ^ out5; + out7 = tmp4 ^ in2; + + out_ptr[0] = out0 ^ 
in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_64(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in2 ^ in3; + out1 = in3 ^ in4; + out7 = in1 ^ in2; + tmp0 = in4 ^ in5; + tmp1 = in0 ^ in7; + out4 = in5 ^ in6 ^ in7; + out2 = tmp0 ^ out0 ^ in0; + out3 = tmp0 ^ out7 ^ in6; + out5 = tmp1 ^ in6; + out6 = tmp1 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_65(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = 
out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in3; + tmp1 = in4 ^ in5; + tmp2 = in6 ^ in7; + out7 = in1 ^ in2 ^ in7; + out1 = in1 ^ in3 ^ in4; + out0 = tmp0 ^ in2; + out2 = tmp0 ^ tmp1; + out4 = tmp1 ^ tmp2; + tmp3 = tmp2 ^ in0; + out3 = out4 ^ out7 ^ in3; + out5 = tmp3 ^ in5; + out6 = tmp3 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_66(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in2; + tmp1 = in2 ^ in3; + tmp2 = in0 ^ in4; + out7 = tmp0 ^ in6; + out0 = tmp1 ^ in7; + out1 = tmp2 ^ in3; + tmp3 = tmp2 ^ in6; + tmp4 = out1 ^ in5; + out5 = tmp3 ^ in7; + out4 = tmp3 ^ tmp4; + out2 = tmp0 ^ tmp4 ^ in7; + out6 = tmp1 ^ out2 ^ in4; + out3 = tmp3 ^ out6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 
7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_67(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in3; + tmp1 = tmp0 ^ in1; + tmp2 = tmp0 ^ in7; + out1 = tmp1 ^ in4; + out0 = tmp2 ^ in2; + tmp3 = out1 ^ in7; + out2 = tmp3 ^ in5; + out3 = out2 ^ in0 ^ in6; + out7 = tmp1 ^ out0 ^ in6; + out5 = tmp1 ^ out3; + out4 = tmp2 ^ out5; + out6 = tmp3 ^ out4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_68(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in4; + tmp1 = in2 ^ in3 ^ in5; + tmp2 = tmp0 ^ in1; + tmp3 = tmp0 ^ in6; + out0 = tmp1 ^ in6; + out6 = tmp2 ^ in0; + 
out7 = tmp1 ^ tmp2; + out1 = tmp3 ^ in7; + out2 = out1 ^ in2; + out4 = tmp2 ^ out2; + out3 = out4 ^ out6 ^ in3; + out5 = tmp3 ^ out3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_69(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in6 ^ in7; + out2 = tmp0 ^ in3 ^ in4; + out1 = out2 ^ in1; + out3 = out2 ^ in0 ^ in2; + out4 = out1 ^ in2 ^ in3; + out6 = out1 ^ in0 ^ in7; + out7 = out4 ^ in5 ^ in6; + out5 = out4 ^ out6 ^ in5; + out0 = tmp0 ^ out5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_6A(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t 
in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in6; + out3 = in0 ^ in4 ^ in6; + tmp1 = tmp0 ^ in3; + out4 = tmp1 ^ in1; + tmp2 = tmp1 ^ in7; + out2 = out4 ^ in4; + out0 = tmp2 ^ in5; + out5 = tmp2 ^ out3; + out7 = out2 ^ in3 ^ in5; + out1 = tmp0 ^ out5; + out6 = tmp1 ^ out7 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_6B(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in4 ^ in6; + out2 = tmp0 ^ in1 ^ in3; + out4 = out2 ^ in2; + tmp1 = out2 ^ in0; + out7 = out4 ^ in3 ^ in5 ^ in7; + out1 = tmp1 ^ in7; + out3 = tmp1 ^ in1; + out6 = tmp1 ^ in5; + out0 = tmp1 ^ out7 ^ in6; + out5 = tmp0 ^ out0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + 
out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_6C(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in1; + tmp0 = in2 ^ in3; + out5 = in0 ^ in2; + out1 = in3 ^ in4 ^ in6; + tmp1 = out5 ^ in1; + out0 = tmp0 ^ in5; + out6 = tmp0 ^ tmp1; + out3 = tmp1 ^ in4; + out7 = out3 ^ in0; + out2 = out6 ^ out7 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_6D(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in1 ^ in4; + tmp0 = in0 ^ in2; + tmp1 = out4 ^ in3; + out7 = out4 ^ in2 ^ in7; + out5 = tmp0 ^ in5; + out3 = tmp0 ^ tmp1; + out1 = tmp1 ^ 
in6; + out0 = out5 ^ in3; + out2 = out3 ^ out7 ^ in4; + out6 = out1 ^ in0 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_6E(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in3; + tmp1 = in0 ^ in4; + out4 = tmp0 ^ in7; + out6 = tmp0 ^ in0 ^ in5; + out5 = tmp1 ^ in2; + tmp2 = tmp1 ^ in3; + out3 = tmp2 ^ out4; + out1 = tmp2 ^ in6; + out2 = tmp0 ^ out5; + out0 = out2 ^ out3 ^ in5; + out7 = out1 ^ out2 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_6F(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; 
+ uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in7; + tmp1 = tmp0 ^ in4; + tmp2 = tmp0 ^ in0 ^ in2; + out4 = tmp1 ^ in1; + out0 = tmp2 ^ in5; + out3 = out4 ^ in0; + out2 = out3 ^ in7; + out1 = out2 ^ in6; + out6 = out1 ^ in4 ^ in5; + out7 = tmp2 ^ out1; + out5 = tmp1 ^ out0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_70(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in2; + tmp0 = in2 ^ in4; + out2 = in2 ^ in3 ^ in5; + tmp1 = tmp0 ^ in6; + tmp2 = out2 ^ in7; + out0 = tmp1 ^ in3; + out4 = tmp1 ^ in0; + out7 = tmp2 ^ in1; + out6 = out4 ^ in1; + out5 = out7 ^ in0 ^ in2; + out1 = tmp0 ^ tmp2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ 
in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_71(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in3 ^ in5; + out3 = in2 ^ in3; + tmp0 = in0 ^ in2; + tmp1 = out2 ^ in1; + out4 = tmp0 ^ in6; + tmp2 = tmp0 ^ in1; + out7 = tmp1 ^ in2; + out1 = tmp1 ^ in4 ^ in7; + out0 = out4 ^ in3 ^ in4; + out6 = tmp2 ^ in4; + out5 = tmp2 ^ out3 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_72(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in7; + tmp0 = in0 ^ in4; + tmp1 = tmp0 ^ in3 ^ in7; + out1 = tmp1 ^ in5; + out5 = out1 ^ in1; + tmp2 = tmp0 ^ out5; + 
out2 = tmp2 ^ in2; + out7 = out2 ^ in6; + out6 = tmp1 ^ out7; + out4 = tmp2 ^ out6; + out0 = out4 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +}

/*
 * gf8_muladd_73 .. gf8_muladd_93
 *
 * Every routine below has the same shape. For each of WIDTH consecutive
 * 64-bit positions it loads eight words from `out` (at offsets 0, WIDTH,
 * WIDTH * 2, ..., WIDTH * 7), derives eight new words purely by XOR-ing
 * those loaded values together, and stores each result XOR-ed with the
 * word found at the same offset in `in`. `out` is updated in place and
 * `in` is only read.
 *
 * The loaded words are called in0..in7 even though they come from `out`:
 * they are the inputs of the XOR network.
 *
 * NOTE(review): the names suggest a bit-sliced GF(2^8) multiply-accumulate
 * by the constant in the suffix (out = out * 0xNN + in). The field
 * polynomial and the definition of WIDTH are not visible from here, so
 * confirm against the code generator before relying on that reading.
 *
 * The XOR sequences are order-dependent (later terms reuse earlier
 * results) and are reproduced unchanged.
 */

/* XOR network 73 applied to out, then XOR-ed with in. */
static void
gf8_muladd_73(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out3 = in3 ^ in7;
        const uint64_t out2 = out3 ^ in1 ^ in5;
        const uint64_t out1 = out2 ^ in0 ^ in4;
        const uint64_t out5 = out1 ^ in5;
        const uint64_t out6 = out1 ^ out3 ^ in2;
        const uint64_t out0 = out2 ^ out6 ^ in6;
        const uint64_t out7 = out0 ^ out1 ^ in3;
        const uint64_t out4 = out0 ^ in4;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 74 applied to out, then XOR-ed with in. */
static void
gf8_muladd_74(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t tmp0 = in3 ^ in4;
        const uint64_t tmp1 = in1 ^ in2 ^ in6;
        const uint64_t out4 = in0 ^ in4 ^ in7;
        const uint64_t out5 = in0 ^ in1 ^ in5;
        const uint64_t out0 = tmp0 ^ in2;
        const uint64_t out1 = tmp0 ^ in5;
        const uint64_t out3 = tmp1 ^ in7;
        const uint64_t out6 = tmp1 ^ in0;
        const uint64_t out2 = tmp1 ^ out5 ^ in3;
        const uint64_t out7 = out3 ^ in3 ^ in6;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 75 applied to out, then XOR-ed with in. */
static void
gf8_muladd_75(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out4 = in0 ^ in7;
        const uint64_t tmp0 = in1 ^ in3;
        const uint64_t out5 = in0 ^ in1;
        const uint64_t out7 = tmp0 ^ in2;
        const uint64_t tmp1 = tmp0 ^ in4;
        const uint64_t out6 = out5 ^ in2;
        const uint64_t tmp2 = out7 ^ in6;
        const uint64_t out1 = tmp1 ^ in5;
        const uint64_t out0 = tmp1 ^ out6;
        const uint64_t out3 = tmp2 ^ in7;
        const uint64_t out2 = tmp2 ^ out6 ^ in5;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 76 applied to out, then XOR-ed with in. */
static void
gf8_muladd_76(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out3 = in1 ^ in6;
        const uint64_t tmp0 = in0 ^ in5;
        const uint64_t tmp1 = in3 ^ in7;
        const uint64_t tmp2 = tmp0 ^ in4;
        const uint64_t tmp3 = tmp1 ^ in2;
        const uint64_t out5 = tmp2 ^ in1;
        const uint64_t out1 = tmp2 ^ in3;
        const uint64_t out0 = tmp3 ^ in4;
        const uint64_t out4 = out1 ^ in5;
        const uint64_t out7 = tmp3 ^ out3;
        const uint64_t out2 = tmp0 ^ out7;
        const uint64_t out6 = tmp1 ^ out2;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 77 applied to out, then XOR-ed with in. */
static void
gf8_muladd_77(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out4 = in0 ^ in3;
        const uint64_t tmp0 = in1 ^ in4;
        const uint64_t tmp1 = in1 ^ in6;
        const uint64_t tmp2 = out4 ^ in5;
        const uint64_t out5 = tmp0 ^ in0;
        const uint64_t out1 = tmp0 ^ tmp2;
        const uint64_t out3 = tmp1 ^ in3;
        const uint64_t out2 = tmp1 ^ tmp2 ^ in7;
        const uint64_t out7 = out3 ^ in2;
        const uint64_t tmp3 = out7 ^ in6;
        const uint64_t out6 = tmp2 ^ tmp3;
        const uint64_t out0 = tmp3 ^ out5 ^ in7;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 78 applied to out, then XOR-ed with in. */
static void
gf8_muladd_78(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t tmp0 = in0 ^ in3;
        const uint64_t tmp1 = in2 ^ in7;
        const uint64_t tmp2 = in0 ^ in5 ^ in6;
        const uint64_t out2 = tmp1 ^ in3;
        const uint64_t out3 = tmp2 ^ in2;
        const uint64_t out5 = out3 ^ in1 ^ in3;
        const uint64_t out0 = tmp0 ^ out3 ^ in4;
        const uint64_t out1 = tmp1 ^ out0;
        const uint64_t out4 = out1 ^ out5 ^ in5;
        const uint64_t out7 = tmp0 ^ out4;
        const uint64_t out6 = tmp2 ^ out7;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 79 applied to out, then XOR-ed with in. */
static void
gf8_muladd_79(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out2 = in3 ^ in7;
        const uint64_t tmp0 = in3 ^ in4;
        const uint64_t tmp1 = in1 ^ in5;
        const uint64_t tmp2 = tmp1 ^ in2;
        const uint64_t out4 = tmp2 ^ in0 ^ in7;
        const uint64_t tmp3 = out4 ^ in5;
        const uint64_t out5 = tmp3 ^ out2 ^ in6;
        const uint64_t out7 = tmp0 ^ tmp2;
        const uint64_t out6 = tmp0 ^ tmp3;
        const uint64_t out3 = tmp1 ^ out5;
        const uint64_t out0 = out3 ^ in4;
        const uint64_t out1 = tmp3 ^ out0;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 7A applied to out, then XOR-ed with in. */
static void
gf8_muladd_7A(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t tmp0 = in1 ^ in2;
        const uint64_t out2 = tmp0 ^ in3;
        const uint64_t tmp1 = out2 ^ in4;
        const uint64_t out4 = tmp1 ^ in0 ^ in5;
        const uint64_t out5 = out4 ^ in6;
        const uint64_t out6 = out5 ^ in7;
        const uint64_t out7 = out6 ^ in0;
        const uint64_t out0 = out7 ^ in1;
        const uint64_t out1 = tmp0 ^ out6;
        const uint64_t out3 = tmp1 ^ out6;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 7B applied to out, then XOR-ed with in. */
static void
gf8_muladd_7B(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out2 = in1 ^ in3;
        const uint64_t tmp0 = in0 ^ in5;
        const uint64_t out4 = tmp0 ^ out2 ^ in2;
        const uint64_t tmp1 = out4 ^ in4;
        const uint64_t out6 = tmp1 ^ in7;
        const uint64_t out5 = tmp1 ^ in5 ^ in6;
        const uint64_t out0 = out6 ^ in1 ^ in6;
        const uint64_t tmp2 = out0 ^ in2;
        const uint64_t out1 = tmp2 ^ in1;
        const uint64_t out3 = tmp2 ^ in4;
        const uint64_t out7 = tmp0 ^ out5;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 7C applied to out, then XOR-ed with in. */
static void
gf8_muladd_7C(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t tmp0 = in3 ^ in5;
        const uint64_t tmp1 = tmp0 ^ in4;
        const uint64_t out0 = tmp1 ^ in2;
        const uint64_t out1 = tmp1 ^ in6;
        const uint64_t out7 = out0 ^ in1 ^ in5 ^ in7;
        const uint64_t out5 = out1 ^ out7 ^ in0;
        const uint64_t out3 = out5 ^ in6;
        const uint64_t out6 = tmp0 ^ out5;
        const uint64_t out2 = out6 ^ in1;
        const uint64_t out4 = out2 ^ out7 ^ in5;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 7D applied to out, then XOR-ed with in. */
static void
gf8_muladd_7D(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t tmp0 = in1 ^ in2;
        const uint64_t tmp1 = tmp0 ^ in3;
        const uint64_t tmp2 = tmp0 ^ in6;
        const uint64_t out7 = tmp1 ^ in4;
        const uint64_t tmp3 = tmp2 ^ in0;
        const uint64_t out5 = tmp3 ^ in7;
        const uint64_t out4 = tmp3 ^ in2 ^ in5;
        const uint64_t out2 = tmp1 ^ out5;
        const uint64_t out6 = tmp2 ^ out2;
        const uint64_t out0 = out4 ^ out7 ^ in6;
        const uint64_t out1 = tmp3 ^ out0;
        const uint64_t out3 = out6 ^ in5;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 7E applied to out, then XOR-ed with in. */
static void
gf8_muladd_7E(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t tmp0 = in3 ^ in4;
        const uint64_t tmp1 = in0 ^ in5;
        const uint64_t out1 = tmp0 ^ tmp1 ^ in6;
        const uint64_t out3 = tmp1 ^ in1;
        const uint64_t out4 = out1 ^ in1 ^ in7;
        const uint64_t tmp2 = out4 ^ in3;
        const uint64_t out5 = tmp2 ^ in2;
        const uint64_t out6 = tmp0 ^ out5;
        const uint64_t out7 = tmp1 ^ out4 ^ in2;
        const uint64_t out2 = out6 ^ in5 ^ in7;
        const uint64_t out0 = tmp2 ^ out2;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 7F applied to out, then XOR-ed with in. */
static void
gf8_muladd_7F(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t tmp0 = in2 ^ in7;
        const uint64_t tmp1 = tmp0 ^ in3 ^ in5;
        const uint64_t tmp2 = tmp1 ^ in0;
        const uint64_t out0 = tmp2 ^ in4;
        const uint64_t out6 = tmp2 ^ in1;
        const uint64_t out3 = tmp0 ^ out6;
        const uint64_t tmp3 = out3 ^ in6;
        const uint64_t out1 = tmp3 ^ in4;
        const uint64_t out2 = tmp3 ^ in5;
        const uint64_t out4 = tmp3 ^ in7;
        const uint64_t out5 = tmp1 ^ out1;
        const uint64_t out7 = out0 ^ out4 ^ in3;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 80 applied to out, then XOR-ed with in. */
static void
gf8_muladd_80(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t tmp0 = in2 ^ in3;
        const uint64_t tmp1 = in4 ^ in5;
        const uint64_t out1 = in2 ^ in6 ^ in7;
        const uint64_t out5 = tmp0 ^ in4;
        const uint64_t tmp2 = tmp0 ^ in1;
        const uint64_t out6 = tmp1 ^ in3;
        const uint64_t out7 = tmp1 ^ in0 ^ in6;
        const uint64_t out4 = tmp2 ^ in7;
        const uint64_t out3 = tmp2 ^ out6;
        const uint64_t out2 = out3 ^ out5 ^ in6;
        const uint64_t out0 = out2 ^ in3 ^ in7;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 81 applied to out, then XOR-ed with in. */
static void
gf8_muladd_81(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t tmp0 = in4 ^ in6;
        const uint64_t tmp1 = tmp0 ^ in3;
        const uint64_t out6 = tmp1 ^ in5;
        const uint64_t out5 = out6 ^ in2 ^ in6;
        const uint64_t out3 = out5 ^ in1;
        const uint64_t out2 = tmp0 ^ out3;
        const uint64_t out1 = out3 ^ out6 ^ in7;
        const uint64_t out4 = tmp1 ^ out1;
        const uint64_t out7 = out2 ^ out4 ^ in0;
        const uint64_t out0 = out7 ^ in1 ^ in4;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 82 applied to out, then XOR-ed with in. */
static void
gf8_muladd_82(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out4 = in1 ^ in2;
        const uint64_t tmp0 = in6 ^ in7;
        const uint64_t out5 = in2 ^ in3;
        const uint64_t out6 = in3 ^ in4;
        const uint64_t out7 = in0 ^ in4 ^ in5;
        const uint64_t out0 = in1 ^ in5 ^ in6;
        const uint64_t out1 = tmp0 ^ in0 ^ in2;
        const uint64_t out2 = tmp0 ^ in3 ^ in5;
        const uint64_t out3 = tmp0 ^ out0 ^ in4;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 83 applied to out, then XOR-ed with in. */
static void
gf8_muladd_83(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t tmp0 = in0 ^ in1;
        const uint64_t tmp1 = in2 ^ in5;
        const uint64_t tmp2 = in3 ^ in6;
        const uint64_t out4 = in1 ^ in2 ^ in4;
        const uint64_t out0 = tmp0 ^ in5 ^ in6;
        const uint64_t out5 = tmp1 ^ in3;
        const uint64_t tmp3 = tmp1 ^ in7;
        const uint64_t out6 = tmp2 ^ in4;
        const uint64_t out2 = tmp2 ^ tmp3;
        const uint64_t tmp4 = tmp3 ^ out4;
        const uint64_t out1 = tmp3 ^ out0;
        const uint64_t out3 = tmp4 ^ in3;
        const uint64_t out7 = tmp0 ^ tmp4;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 84 applied to out, then XOR-ed with in. */
static void
gf8_muladd_84(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out1 = in2 ^ in6;
        const uint64_t out6 = in3 ^ in5;
        const uint64_t out0 = in1 ^ in5 ^ in7;
        const uint64_t out7 = in0 ^ in4 ^ in6;
        const uint64_t out4 = in1 ^ in3 ^ in6;
        const uint64_t out5 = in2 ^ in4 ^ in7;
        const uint64_t out2 = out6 ^ in0 ^ in1;
        const uint64_t out3 = out5 ^ in5 ^ in6;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 85 applied to out, then XOR-ed with in. */
static void
gf8_muladd_85(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t tmp0 = in1 ^ in6;
        const uint64_t tmp1 = in3 ^ in6;
        const uint64_t tmp2 = tmp0 ^ in4;
        const uint64_t out1 = tmp0 ^ in2;
        const uint64_t out6 = tmp1 ^ in5;
        const uint64_t out4 = tmp2 ^ in3;
        const uint64_t tmp3 = out1 ^ out6;
        const uint64_t out2 = tmp3 ^ in0;
        const uint64_t out3 = tmp2 ^ tmp3 ^ in7;
        const uint64_t out7 = out2 ^ out3 ^ in1;
        const uint64_t out5 = tmp1 ^ out3;
        const uint64_t out0 = tmp2 ^ out7 ^ in5;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 86 applied to out, then XOR-ed with in. */
static void
gf8_muladd_86(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out6 = in3;
        const uint64_t out7 = in0 ^ in4;
        const uint64_t out0 = in1 ^ in5;
        const uint64_t out5 = in2 ^ in7;
        const uint64_t out3 = in4 ^ in5 ^ in6;
        const uint64_t out1 = in0 ^ in2 ^ in6;
        const uint64_t out4 = in1 ^ in6 ^ in7;
        const uint64_t out2 = in0 ^ in3 ^ in5 ^ in7;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 87 applied to out, then XOR-ed with in. */
static void
gf8_muladd_87(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out6 = in3 ^ in6;
        const uint64_t tmp0 = in0 ^ in1;
        const uint64_t out7 = in0 ^ in4 ^ in7;
        const uint64_t out5 = in2 ^ in5 ^ in7;
        const uint64_t out3 = out6 ^ in4 ^ in5;
        const uint64_t out0 = tmp0 ^ in5;
        const uint64_t tmp1 = tmp0 ^ in6;
        const uint64_t out2 = out5 ^ in0 ^ in3;
        const uint64_t out1 = tmp1 ^ in2;
        const uint64_t out4 = tmp1 ^ out7;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 88 applied to out, then XOR-ed with in. */
static void
gf8_muladd_88(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out1 = in2 ^ in7;
        const uint64_t tmp0 = in5 ^ in6;
        const uint64_t out0 = in1 ^ in6 ^ in7;
        const uint64_t out6 = in4 ^ in5 ^ in7;
        const uint64_t out3 = out0 ^ out1 ^ in0 ^ in4;
        const uint64_t out7 = tmp0 ^ in0;
        const uint64_t tmp1 = tmp0 ^ in3;
        const uint64_t out2 = out0 ^ in3;
        const uint64_t out4 = tmp1 ^ in2;
        const uint64_t out5 = tmp1 ^ out6;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 89 applied to out, then XOR-ed with in. */
static void
gf8_muladd_89(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t tmp0 = in0 ^ in7;
        const uint64_t tmp1 = in2 ^ in7;
        const uint64_t tmp2 = tmp0 ^ in6;
        const uint64_t out1 = tmp1 ^ in1;
        const uint64_t out7 = tmp2 ^ in5;
        const uint64_t out0 = tmp2 ^ in1;
        const uint64_t out2 = out1 ^ in3 ^ in6;
        const uint64_t out6 = out7 ^ in0 ^ in4;
        const uint64_t out5 = out6 ^ in3;
        const uint64_t out3 = tmp0 ^ out2 ^ in4;
        const uint64_t out4 = tmp1 ^ out5;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 8A applied to out, then XOR-ed with in. */
static void
gf8_muladd_8A(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out0 = in1 ^ in6;
        const uint64_t out7 = in0 ^ in5;
        const uint64_t out2 = in3 ^ in6;
        const uint64_t out6 = in4 ^ in7;
        const uint64_t out1 = in0 ^ in2 ^ in7;
        const uint64_t out3 = out0 ^ out6 ^ in0;
        const uint64_t out4 = out1 ^ out7 ^ in6;
        const uint64_t out5 = out2 ^ in7;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 8B applied to out, then XOR-ed with in. */
static void
gf8_muladd_8B(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t tmp0 = in0 ^ in1;
        const uint64_t tmp1 = in3 ^ in6;
        const uint64_t tmp2 = in5 ^ in7;
        const uint64_t tmp3 = tmp0 ^ in7;
        const uint64_t out0 = tmp0 ^ in6;
        const uint64_t out2 = tmp1 ^ in2;
        const uint64_t out5 = tmp1 ^ tmp2;
        const uint64_t out7 = tmp2 ^ in0;
        const uint64_t tmp4 = tmp3 ^ in4;
        const uint64_t out1 = tmp3 ^ in2;
        const uint64_t out6 = tmp4 ^ out0;
        const uint64_t out4 = out6 ^ in2 ^ in5;
        const uint64_t out3 = tmp1 ^ tmp4;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 8C applied to out, then XOR-ed with in. */
static void
gf8_muladd_8C(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out1 = in2;
        const uint64_t out0 = in1 ^ in7;
        const uint64_t out7 = in0 ^ in6;
        const uint64_t out5 = in4 ^ in6;
        const uint64_t out6 = in5 ^ in7;
        const uint64_t out2 = out0 ^ in0 ^ in3;
        const uint64_t out3 = out5 ^ out7 ^ in2 ^ in7;
        const uint64_t out4 = out6 ^ in3;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 8D applied to out, then XOR-ed with in. */
static void
gf8_muladd_8D(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out1 = in1 ^ in2;
        const uint64_t tmp0 = in6 ^ in7;
        const uint64_t out0 = in0 ^ in1 ^ in7;
        const uint64_t out5 = in4 ^ in5 ^ in6;
        const uint64_t out6 = tmp0 ^ in5;
        const uint64_t out7 = tmp0 ^ in0;
        const uint64_t out4 = tmp0 ^ out5 ^ in3;
        const uint64_t out2 = out0 ^ in2 ^ in3;
        const uint64_t out3 = out2 ^ in1 ^ in4;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 8E applied to out, then XOR-ed with in. */
static void
gf8_muladd_8E(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out0 = in1;
        const uint64_t out4 = in5;
        const uint64_t out7 = in0;
        const uint64_t out5 = in6;
        const uint64_t out6 = in7;
        const uint64_t out3 = in0 ^ in4;
        const uint64_t out1 = in0 ^ in2;
        const uint64_t out2 = in0 ^ in3;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 8F applied to out, then XOR-ed with in. */
static void
gf8_muladd_8F(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out0 = in0 ^ in1;
        const uint64_t tmp0 = in0 ^ in3;
        const uint64_t out4 = in4 ^ in5;
        const uint64_t out7 = in0 ^ in7;
        const uint64_t out5 = in5 ^ in6;
        const uint64_t out6 = in6 ^ in7;
        const uint64_t out1 = out0 ^ in2;
        const uint64_t out2 = tmp0 ^ in2;
        const uint64_t out3 = tmp0 ^ in4;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 90 applied to out, then XOR-ed with in. */
static void
gf8_muladd_90(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t tmp0 = in1 ^ in2;
        const uint64_t tmp1 = in2 ^ in6 ^ in7;
        const uint64_t out3 = tmp0 ^ in7;
        const uint64_t out1 = tmp1 ^ in5;
        const uint64_t tmp2 = out1 ^ in4;
        const uint64_t out6 = tmp2 ^ in3;
        const uint64_t out5 = out6 ^ in1;
        const uint64_t out4 = out5 ^ in0;
        const uint64_t out0 = tmp0 ^ tmp2;
        const uint64_t out7 = tmp0 ^ out4;
        const uint64_t out2 = tmp1 ^ out5;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 91 applied to out, then XOR-ed with in. */
static void
gf8_muladd_91(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t tmp0 = in2 ^ in4;
        const uint64_t tmp1 = tmp0 ^ in3 ^ in5;
        const uint64_t out2 = tmp1 ^ in1;
        const uint64_t out6 = tmp1 ^ in7;
        const uint64_t tmp2 = out2 ^ in5 ^ in7;
        const uint64_t out3 = tmp2 ^ in4;
        const uint64_t out5 = tmp2 ^ in6;
        const uint64_t out1 = tmp1 ^ out5 ^ in2;
        const uint64_t tmp3 = out1 ^ in0;
        const uint64_t out4 = tmp3 ^ in3;
        const uint64_t out0 = tmp0 ^ tmp3;
        const uint64_t out7 = tmp2 ^ tmp3;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 92 applied to out, then XOR-ed with in. */
static void
gf8_muladd_92(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out3 = in1;
        const uint64_t tmp0 = in4 ^ in5;
        const uint64_t tmp1 = tmp0 ^ in1;
        const uint64_t out2 = tmp0 ^ in3 ^ in7;
        const uint64_t out0 = tmp1 ^ in6;
        const uint64_t out7 = out2 ^ in0;
        const uint64_t out4 = out0 ^ in0 ^ in2;
        const uint64_t out5 = out4 ^ out7 ^ in5;
        const uint64_t out6 = tmp1 ^ out5;
        const uint64_t out1 = out6 ^ out7 ^ in7;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR network 93 applied to out, then XOR-ed with in. */
static void
gf8_muladd_93(void *out, void *in)
{
    const uint64_t *src = (const uint64_t *)in;
    uint64_t *dst = (uint64_t *)out;
    unsigned int col;

    for (col = 0; col < WIDTH; col++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];

        const uint64_t out3 = in1 ^ in3;
        const uint64_t tmp0 = in2 ^ in7;
        const uint64_t tmp1 = out3 ^ in6;
        const uint64_t tmp2 = tmp0 ^ in4;
        const uint64_t out5 = tmp0 ^ tmp1;
        const uint64_t out6 = tmp2 ^ in3;
        const uint64_t out2 = out6 ^ in5;
        const uint64_t out0 = out2 ^ out5 ^ in0;
        const uint64_t out7 = tmp1 ^ out0;
        const uint64_t out1 = tmp2 ^ out0;
        const uint64_t out4 = out1 ^ in7;

        dst[0] = out0 ^ src[0];
        dst[WIDTH] = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

+static void +gf8_muladd_94(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i <
WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in2 ^ in6; + tmp0 = in1 ^ in4 ^ in5; + out1 = out3 ^ in5; + out5 = tmp0 ^ out3; + out0 = tmp0 ^ in7; + out4 = tmp0 ^ in0 ^ in3; + out6 = out1 ^ in3 ^ in7; + out2 = out4 ^ in6; + out7 = out0 ^ out2 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_95(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in3; + out3 = tmp0 ^ in6; + tmp1 = tmp0 ^ in7; + tmp2 = out3 ^ in0; + out6 = tmp1 ^ in5; + tmp3 = tmp2 ^ in4; + out7 = tmp3 ^ in2; + tmp4 = tmp3 ^ in5; + out2 = tmp4 ^ in1; + tmp5 = out2 ^ in6; + out0 = tmp1 ^ tmp5; + out1 = tmp5 ^ out7; + out4 = tmp2 ^ out1; + out5 = tmp4 ^ out4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + 
out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_96(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in6 ^ in7; + tmp0 = in1 ^ in5; + tmp1 = in5 ^ in6; + out6 = out3 ^ in2 ^ in3; + out0 = tmp0 ^ in4; + tmp2 = tmp1 ^ in2; + out4 = out0 ^ in0 ^ in7; + out1 = tmp2 ^ in0; + out5 = tmp2 ^ in1; + out7 = tmp0 ^ out4 ^ in3; + out2 = tmp1 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_97(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = 
out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in4; + tmp1 = in2 ^ in6; + out3 = in3 ^ in6 ^ in7; + out7 = tmp0 ^ in3; + tmp2 = tmp0 ^ in5; + out5 = tmp1 ^ in1; + out6 = tmp1 ^ out3; + out0 = tmp2 ^ in1; + out2 = tmp2 ^ out3 ^ in2; + tmp3 = out0 ^ in4; + out4 = tmp3 ^ in7; + out1 = tmp1 ^ tmp3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_98(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in5 ^ in7; + tmp1 = in1 ^ in4 ^ in7; + out1 = tmp0 ^ in2; + out0 = tmp1 ^ in6; + out2 = tmp1 ^ in3; + out6 = out0 ^ out1 ^ in1; + out5 = tmp0 ^ out2; + out3 = tmp1 ^ out6 ^ in0; + out7 = out0 ^ out5 ^ in0; + out4 = out6 ^ out7 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_99(void *out, void *in) +{ + unsigned int i; + 
uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in3; + out5 = in1 ^ in3 ^ in4; + out6 = in2 ^ in4 ^ in5; + out4 = tmp0 ^ in2; + tmp1 = tmp0 ^ in6; + tmp2 = out5 ^ in7; + out7 = tmp1 ^ in5; + out0 = tmp1 ^ tmp2; + out2 = tmp2 ^ in2; + out3 = out0 ^ out6 ^ in3; + out1 = tmp1 ^ out3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_9A(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in3 ^ in4; + tmp0 = in0 ^ in5; + tmp1 = in1 ^ in6; + out5 = in1 ^ in3 ^ in5; + tmp2 = tmp0 ^ in7; + out3 = tmp0 ^ tmp1; + out0 = tmp1 ^ in4; + out7 = tmp2 ^ in3; + out1 = tmp2 ^ in2; + out6 = out0 ^ in1 ^ in2; + out4 = out1 ^ in4 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 
^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_9B(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out5 = in1 ^ in3; + tmp0 = in3 ^ in5; + out6 = in2 ^ in4; + out4 = in0 ^ in2 ^ in7; + out7 = tmp0 ^ in0; + out2 = out6 ^ in3; + out1 = out4 ^ in1 ^ in5; + out3 = out7 ^ in1 ^ in6; + out0 = tmp0 ^ out3 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_9C(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = 
out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out1 = in2 ^ in5; + tmp0 = in0 ^ in3 ^ in6; + out3 = out1 ^ in0; + out6 = out1 ^ in6; + out7 = tmp0 ^ in7; + out4 = out7 ^ in4; + out2 = out4 ^ in1; + out0 = tmp0 ^ out2; + out5 = out0 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_9D(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out6 = in2 ^ in5; + tmp0 = in0 ^ in3; + out5 = in1 ^ in4 ^ in7; + out1 = out6 ^ in1; + out3 = tmp0 ^ out6; + out7 = tmp0 ^ in6; + out0 = out5 ^ in0; + out4 = out7 ^ in7; + out2 = out5 ^ out7 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_9E(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t 
out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in1 ^ in4; + tmp0 = in0 ^ in5; + out6 = in2 ^ in6; + out7 = in0 ^ in3 ^ in7; + out4 = in0 ^ in4 ^ in6; + out5 = in1 ^ in5 ^ in7; + out1 = tmp0 ^ in2; + out3 = tmp0 ^ in7; + out2 = out4 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_9F(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out6 = in2; + out7 = in0 ^ in3; + tmp0 = in0 ^ in1; + out4 = in0 ^ in6; + out5 = in1 ^ in7; + out1 = tmp0 ^ in2 ^ in5; + out2 = out7 ^ in2 ^ in4 ^ in6; + out3 = out7 ^ in5 ^ in7; + out0 = tmp0 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] 
= out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_A0(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in6; + out2 = tmp0 ^ in7; + tmp1 = tmp0 ^ in5; + out6 = out2 ^ in3 ^ in4; + out0 = tmp1 ^ in3; + tmp2 = out0 ^ in2; + out3 = tmp2 ^ in7; + tmp3 = tmp2 ^ in1; + out5 = tmp3 ^ in0; + out4 = tmp3 ^ out6; + out7 = out5 ^ out6 ^ in1; + out1 = tmp1 ^ out4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_A1(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in5; + tmp1 = tmp0 ^ in1; + tmp2 = tmp0 ^ in4; + out4 = tmp1 ^ in7; + out7 = tmp2 
^ in0; + out6 = tmp2 ^ out4 ^ in3; + out3 = out4 ^ in6; + out2 = out3 ^ in5; + out1 = out2 ^ in4; + out5 = out1 ^ out6 ^ in0; + out0 = tmp1 ^ out5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_A2(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in6; + tmp0 = in1 ^ in3 ^ in5; + out3 = tmp0 ^ in6; + out4 = tmp0 ^ in2 ^ in4; + out0 = out3 ^ in7; + out6 = out0 ^ in4; + out1 = out0 ^ out4 ^ in0; + out7 = out1 ^ in5; + out5 = out7 ^ in3 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_A3(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = 
out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in2 ^ in6; + out3 = in1 ^ in5 ^ in6; + tmp0 = out2 ^ in0; + out4 = out2 ^ out3 ^ in3; + tmp1 = tmp0 ^ in4; + out0 = tmp0 ^ out4 ^ in7; + out5 = tmp1 ^ in3; + out7 = tmp1 ^ in5; + out1 = tmp1 ^ in1 ^ in7; + out6 = tmp1 ^ out0 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_A4(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in3; + tmp1 = in2 ^ in4; + tmp2 = in2 ^ in5; + tmp3 = in0 ^ in7; + out0 = tmp0 ^ in5; + out6 = tmp0 ^ in6 ^ in7; + out1 = tmp1 ^ in6; + out7 = tmp1 ^ tmp3; + out3 = tmp2 ^ in3; + tmp4 = tmp2 ^ out1; + out2 = tmp3 ^ in1; + out5 = tmp4 ^ out7; + out4 = tmp4 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + 
out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_A5(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in2 ^ in5; + tmp0 = in1 ^ in6; + tmp1 = in0 ^ in1; + tmp2 = in2 ^ in4; + out6 = in1 ^ in3 ^ in7; + out4 = tmp0 ^ in5; + out1 = tmp0 ^ tmp2; + out0 = tmp1 ^ in3 ^ in5; + out2 = tmp1 ^ in2 ^ in7; + out7 = tmp2 ^ in0; + out5 = tmp0 ^ out2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_A6(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in0; + out3 = in3 ^ in5 ^ in7; + out1 = in0 ^ in2 ^ in4 ^ in6; + out0 = 
out3 ^ in1; + out7 = out1 ^ in7; + out6 = out0 ^ in6; + out5 = out7 ^ in5; + out4 = out6 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_A7(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in0 ^ in2; + out3 = in5 ^ in7; + out7 = out2 ^ in4 ^ in6; + out6 = out3 ^ in1 ^ in3; + out1 = out7 ^ in1; + out5 = out7 ^ in7; + out0 = out6 ^ in0; + out4 = out6 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_A8(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = 
out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in4; + tmp1 = in1 ^ in6; + tmp2 = in0 ^ in2 ^ in7; + out1 = tmp0 ^ in7; + out4 = tmp0 ^ in6; + out0 = tmp1 ^ in3; + out2 = tmp1 ^ in5; + out6 = tmp1 ^ in4; + out7 = tmp2 ^ in5; + out3 = tmp2 ^ out0 ^ in6; + out5 = out7 ^ in2 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_A9(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in2 ^ in6; + out6 = in1 ^ in4; + out7 = in0 ^ in2 ^ in5; + out5 = in0 ^ in3 ^ in7; + out2 = out4 ^ in1 ^ in5; + out1 = out6 ^ in2 ^ in7; + out0 = out2 ^ out7 ^ in3; + out3 = out1 ^ in0 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_AA(void *out, 
void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in2; + tmp1 = in1 ^ in3; + tmp2 = in6 ^ in7; + out1 = tmp0 ^ in4 ^ in7; + out3 = tmp1 ^ in0; + out0 = tmp1 ^ tmp2; + out2 = tmp2 ^ in5; + out7 = tmp0 ^ out2; + out6 = out1 ^ out7 ^ in1; + out5 = out0 ^ out6 ^ in0; + out4 = out5 ^ out7 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_AB: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xAB then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_AB(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in0 ^ in1; + tmp0 = in1 ^ in4; + tmp1 = in0 ^ in7; + out6 = tmp0 ^ in5; + out1 = tmp0 ^ tmp1 ^ in2; + out5 = tmp1 ^ in3 ^ in4; + out0 = tmp0 ^ out5 ^ in6; + out4 = out0 ^ out3 ^ in2; + out2 = out4 ^ in3 ^ in5; + out7 = tmp1 ^ out2; + + out_ptr[0] = out0 ^ 
in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_AC: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xAC then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_AC(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in1 ^ in3; + out1 = in2 ^ in4; + tmp0 = in0 ^ in2; + out4 = in4 ^ in7; + out5 = in0 ^ in5; + out6 = in1 ^ in6; + out7 = tmp0 ^ in7; + out3 = tmp0 ^ in3 ^ in6; + out2 = out5 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_AD: same column-wise scheme (out = XOR mix of out's 8 words, then XOR with in), with its own XOR network. Presumably GF(2^8) multiply by 0xAD then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_AD(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = 
out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in7; + out5 = in0; + out6 = in1; + out7 = in0 ^ in2; + out0 = in0 ^ in1 ^ in3; + out2 = out7 ^ in1 ^ in5; + out1 = in1 ^ in2 ^ in4; + out3 = out7 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_AE: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xAE then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_AE(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in3 ^ in4; + tmp0 = in0 ^ in4; + tmp1 = in0 ^ in7; + out0 = in1 ^ in3 ^ in7; + out1 = tmp0 ^ in2; + out5 = tmp0 ^ in5; + tmp2 = tmp1 ^ in6; + out2 = tmp1 ^ in5; + out3 = tmp2 ^ in3; + out7 = tmp2 ^ in2; + out6 = tmp2 ^ out2 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_AF: same column-wise scheme (out = XOR mix of out's 8 words, then XOR with in), with its own XOR network. Presumably GF(2^8) multiply by 0xAF then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_AF(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < 
WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in3; + tmp0 = in0 ^ in7; + out5 = in0 ^ in4; + out6 = in1 ^ in5; + out7 = in0 ^ in2 ^ in6; + out0 = tmp0 ^ in1 ^ in3; + out3 = tmp0 ^ in6; + out2 = tmp0 ^ in2 ^ in5; + out1 = out5 ^ in1 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_B0: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xB0 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_B0(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in4; + tmp1 = in3 ^ in6; + out2 = tmp0 ^ in7; + tmp2 = tmp0 ^ tmp1; + out0 = tmp2 ^ in5; + out3 = tmp2 ^ in2; + out6 = out3 ^ in6; + tmp3 = out6 ^ in0 ^ in1; + out7 = tmp3 ^ in5; + out5 = tmp3 ^ out2; + out1 = out0 ^ out5 ^ in0; + out4 = tmp1 ^ out5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + 
out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_B1: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xB1 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_B1(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in4; + out2 = tmp0 ^ in2 ^ in7; + tmp1 = out2 ^ in6; + out1 = tmp1 ^ in5; + out3 = tmp1 ^ in7; + out4 = tmp1 ^ in0; + out6 = out3 ^ in3; + out0 = out6 ^ in0 ^ in2 ^ in5; + out5 = tmp1 ^ out0 ^ in1; + out7 = tmp0 ^ out5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_B2: same column-wise scheme (out = XOR mix of out's 8 words, then XOR with in), with its own XOR network. Presumably GF(2^8) multiply by 0xB2 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_B2(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = 
in4; + tmp0 = in4 ^ in7; + tmp1 = in1 ^ in3 ^ in6; + out3 = tmp0 ^ tmp1; + tmp2 = tmp1 ^ in0; + out0 = out3 ^ in5; + out4 = tmp2 ^ in2; + tmp3 = out4 ^ in6; + out5 = tmp0 ^ tmp3; + out1 = tmp3 ^ out0; + tmp4 = out1 ^ in7; + out7 = tmp4 ^ in3; + out6 = tmp2 ^ tmp4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_B3: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xB3 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_B3(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in2 ^ in4; + tmp0 = in0 ^ in5; + tmp1 = in1 ^ in6; + out3 = tmp1 ^ in4 ^ in7; + tmp2 = tmp0 ^ out3; + out0 = tmp2 ^ in3; + out1 = tmp2 ^ in2; + out5 = out0 ^ in2 ^ in6; + out7 = tmp1 ^ out5; + out4 = out7 ^ in1 ^ in5 ^ in7; + out6 = tmp0 ^ out4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_B4: same column-wise scheme (out = XOR mix of out's 8 words, then XOR with in), with its own XOR network. Presumably GF(2^8) multiply by 0xB4 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_B4(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + 
uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in0 ^ in1; + out5 = out4 ^ in2; + tmp0 = out4 ^ in4; + out6 = out5 ^ in0 ^ in3; + out7 = tmp0 ^ out6; + out2 = tmp0 ^ in6 ^ in7; + out3 = out7 ^ in0 ^ in7; + out0 = out5 ^ out7 ^ in5; + out1 = out0 ^ out6 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_B5: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xB5 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_B5(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + tmp1 = in2 ^ in4; + out4 = tmp0 ^ in4; + out3 = tmp1 ^ in7; + tmp2 = out4 ^ in5; + out7 = out3 ^ in0 ^ in3; + out0 = tmp2 ^ in3; + out2 = tmp0 ^ out3 ^ in6; + out5 = tmp1 ^ tmp2; + out6 = out2 ^ out7 ^ in2; + out1 = tmp0 ^ out0 ^ out6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 
2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_B6: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xB6 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_B6(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in3 ^ in4; + tmp0 = in1 ^ in2; + tmp1 = in0 ^ in4; + tmp2 = in3 ^ in5; + tmp3 = out3 ^ in1 ^ in7; + out5 = tmp0 ^ tmp1; + out6 = tmp0 ^ tmp2; + out2 = tmp1 ^ in6; + out4 = tmp1 ^ tmp3; + out0 = tmp3 ^ in5; + out1 = out2 ^ in2 ^ in5; + out7 = tmp2 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_B7: same column-wise scheme (out = XOR mix of out's 8 words, then XOR with in), with its own XOR network. Presumably GF(2^8) multiply by 0xB7 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_B7(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + 
uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in4; + tmp0 = in0 ^ in4; + out2 = tmp0 ^ in2 ^ in6; + tmp1 = out2 ^ in7; + out1 = out2 ^ in1 ^ in5; + out7 = tmp1 ^ in3; + out5 = out1 ^ in6; + out6 = tmp0 ^ out1 ^ in3; + out0 = tmp1 ^ out6; + out4 = out0 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_B8: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xB8 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_B8(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in4; + tmp1 = in2 ^ in5; + out2 = tmp0 ^ in5; + out4 = tmp1 ^ in0; + tmp2 = tmp1 ^ in7; + out6 = tmp2 ^ out2; + out7 = out4 ^ in3; + out1 = tmp2 ^ in4; + out3 = tmp0 ^ out7; + out0 = out3 ^ out4 ^ in6; + out5 = out0 ^ in0 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_B9: same column-wise scheme (out = XOR mix of out's 8 words, then XOR with in), with its own XOR network. Presumably GF(2^8) multiply by 0xB9 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_B9(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = 
(uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in2; + tmp1 = in4 ^ in5; + out4 = tmp0 ^ tmp1; + tmp2 = tmp0 ^ in3 ^ in7; + out3 = out4 ^ in1; + out7 = tmp2 ^ in5; + out2 = out3 ^ in0; + out1 = out2 ^ in7; + out6 = out1 ^ in5 ^ in6; + out0 = tmp2 ^ out6; + out5 = tmp1 ^ out0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_BA: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xBA then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_BA(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in5 ^ in7; + out2 = tmp0 ^ in4; + tmp1 = out2 ^ in2; + out1 = tmp1 ^ in0; + out6 = tmp1 ^ in1; + out4 = out1 ^ in3 ^ in4; + tmp2 = out4 ^ out6; + out7 = out4 ^ in6 ^ in7; + out5 = tmp2 ^ in6; + out3 = tmp0 ^ tmp2; + out0 = out6 ^ out7 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + 
out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_BB: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xBB then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_BB(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in2 ^ in4 ^ in5 ^ in7; + tmp0 = out2 ^ in1; + out4 = out2 ^ in0 ^ in3; + out1 = tmp0 ^ in0; + out6 = tmp0 ^ in6; + out3 = out1 ^ in2; + tmp1 = out4 ^ out6 ^ in4; + out0 = tmp1 ^ in7; + out5 = tmp1 ^ in5; + out7 = tmp0 ^ tmp1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_BC: same column-wise scheme (out = XOR mix of out's 8 words, then XOR with in), with its own XOR network. Presumably GF(2^8) multiply by 0xBC then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_BC(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; 
+ uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in2; + tmp1 = in2 ^ in4; + out0 = in1 ^ in3 ^ in4; + out6 = in1 ^ in2 ^ in7; + out7 = tmp0 ^ in3; + out5 = tmp0 ^ out6 ^ in6; + out1 = tmp1 ^ in5; + tmp2 = out1 ^ out5 ^ in1; + out3 = tmp2 ^ in3; + out4 = tmp1 ^ tmp2; + out2 = tmp2 ^ out6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_BD: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xBD then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_BD(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in3; + tmp1 = in1 ^ in4; + out0 = tmp0 ^ tmp1; + out7 = tmp0 ^ in2 ^ in7; + out1 = tmp1 ^ in2 ^ in5; + tmp2 = out1 ^ in0; + out2 = tmp2 ^ in6; + out3 = out2 ^ in1 ^ in7; + out4 = out3 ^ in2; + out5 = tmp1 ^ out4; + out6 = tmp2 ^ out4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_BE: same column-wise scheme (out = XOR mix of out's 8 words, then XOR with in), with its own XOR network. Presumably GF(2^8) multiply by 0xBE then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_BE(void *out, void *in) 
+{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in3 ^ in6; + out4 = tmp0 ^ in5; + out7 = tmp0 ^ in2; + out3 = out4 ^ in4; + out1 = out3 ^ out7 ^ in0; + out2 = out3 ^ in3 ^ in7; + out0 = out2 ^ out4 ^ in1; + out5 = tmp0 ^ out0; + out6 = out1 ^ out5 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_BF: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xBF then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_BF(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in4; + out3 = tmp0 ^ in5 ^ in6; + out4 = out3 ^ in3; + tmp1 = out3 ^ in7; + out2 = tmp1 ^ in2; + out5 = tmp1 ^ in1; + tmp2 = out2 ^ in5; + out7 = tmp2 ^ in3 ^ in4; + tmp3 = tmp0 ^ out5; + out0 = tmp3 ^ out4; + out1 = tmp2 ^ tmp3; + out6 = tmp3 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + 
out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_C0: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xC0 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_C0(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out5 = in2 ^ in5; + tmp0 = in1 ^ in4; + tmp1 = in3 ^ in6; + out0 = out5 ^ in1; + out4 = tmp0 ^ in7; + out3 = tmp0 ^ tmp1; + out1 = tmp1 ^ in2; + out6 = tmp1 ^ in0; + out7 = out4 ^ in0; + out2 = out4 ^ out5 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_C1: same column-wise scheme (out = XOR mix of out's 8 words, then XOR with in), with its own XOR network. Presumably GF(2^8) multiply by 0xC1 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_C1(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = 
out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out5 = in2; + tmp0 = in0 ^ in1; + out4 = in1 ^ in7; + out6 = in0 ^ in3; + out3 = in1 ^ in4 ^ in6; + tmp1 = tmp0 ^ in2; + out7 = tmp0 ^ in4; + out0 = tmp1 ^ in5; + out1 = tmp1 ^ out6 ^ in6; + out2 = out6 ^ out7 ^ in5 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_C2: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xC2 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_C2(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in1 ^ in3 ^ in4; + tmp0 = in0 ^ in3 ^ in6; + out5 = in2 ^ in4 ^ in5; + tmp1 = out4 ^ in7; + out1 = tmp0 ^ in2; + out6 = tmp0 ^ in5; + out2 = out5 ^ in3; + out7 = tmp0 ^ tmp1; + out3 = tmp1 ^ in2 ^ in6; + out0 = tmp1 ^ out2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_C3: same column-wise scheme (out = XOR mix of out's 8 words, then XOR with in), with its own XOR network. Presumably GF(2^8) multiply by 0xC3 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_C3(void *out, void *in) +{ + unsigned int i; + uint64_t 
*in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in1 ^ in3; + tmp0 = in0 ^ in2; + tmp1 = in3 ^ in5; + out5 = in2 ^ in4; + tmp2 = tmp0 ^ out4; + out2 = tmp1 ^ in4; + out6 = tmp1 ^ in0; + out0 = tmp1 ^ tmp2 ^ in7; + out1 = tmp2 ^ in6; + out7 = out1 ^ out5 ^ in3; + out3 = tmp0 ^ out7 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_C4: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xC4 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_C4(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in7; + out3 = tmp0 ^ in4; + tmp1 = tmp0 ^ in2; + out1 = tmp1 ^ in6; + out5 = tmp1 ^ in5; + out4 = out1 ^ out3 ^ in1; + out0 = out4 ^ in4 ^ in5; + out2 = out0 ^ out3 ^ in0; + out7 = out1 ^ out2 ^ in7; + out6 = tmp1 ^ out0 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; 
+ out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_C5: for each of WIDTH columns, replaces the 8 uint64_t words of out (stride WIDTH) with a fixed XOR mix of themselves XORed with the matching words of in. Presumably GF(2^8) multiply by 0xC5 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_C5(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in4 ^ in7; + tmp0 = in3 ^ in7; + out4 = in1 ^ in2 ^ in6; + out6 = in0 ^ in3 ^ in4; + out5 = tmp0 ^ in2; + out1 = tmp0 ^ out4; + out0 = out4 ^ in0 ^ in5; + out2 = out0 ^ out5 ^ in4; + out7 = tmp0 ^ out2 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} +/* gf8_muladd_C6: same column-wise scheme (out = XOR mix of out's 8 words, then XOR with in), with its own XOR network. Presumably GF(2^8) multiply by 0xC6 then add, judging by the name -- TODO confirm. */ +static void +gf8_muladd_C6(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + 
uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in5 ^ in6; + tmp1 = in1 ^ in7; + tmp2 = tmp0 ^ in0; + tmp3 = tmp0 ^ tmp1; + tmp4 = tmp2 ^ in4; + out0 = tmp3 ^ in2; + out6 = tmp4 ^ in3; + out2 = out6 ^ in2; + out7 = tmp1 ^ tmp4; + out3 = tmp2 ^ out2; + tmp5 = out3 ^ in5; + out5 = tmp5 ^ in7; + out4 = tmp3 ^ tmp5; + out1 = tmp4 ^ out5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_C7(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in2 ^ in4; + tmp0 = in3 ^ in5; + tmp1 = out3 ^ in7; + out6 = tmp0 ^ in0 ^ in4; + out5 = tmp1 ^ in3; + out2 = out6 ^ in6; + out7 = out2 ^ in1 ^ in3; + out0 = tmp1 ^ out7; + out1 = tmp0 ^ out0; + out4 = out1 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_C8(void *out, void 
*in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out0 = in1 ^ in2; + out1 = in2 ^ in3; + tmp0 = in5 ^ in6; + tmp1 = in0 ^ in7; + out2 = out1 ^ in1 ^ in4; + out4 = tmp0 ^ in4; + out5 = tmp0 ^ in7; + out6 = tmp1 ^ in6; + out7 = tmp1 ^ in1; + out3 = out2 ^ in0 ^ in2 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_C9(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in5 ^ in6; + out7 = in0 ^ in1; + tmp0 = in1 ^ in3; + out5 = in6 ^ in7; + out6 = in0 ^ in7; + out0 = out7 ^ in2; + out3 = out7 ^ in4 ^ in5; + out1 = tmp0 ^ in2; + out2 = tmp0 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + 
out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_CA(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in7; + tmp1 = in2 ^ in7; + tmp2 = tmp0 ^ in6; + out0 = tmp1 ^ in1; + tmp3 = tmp1 ^ in3; + out6 = tmp2 ^ in5; + out7 = tmp2 ^ in1; + out2 = tmp3 ^ in4; + out5 = out6 ^ in0 ^ in4; + out4 = out5 ^ in3; + out1 = tmp0 ^ tmp3; + out3 = tmp3 ^ out5 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_CB(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + 
uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in4 ^ in7; + tmp1 = in5 ^ in7; + out7 = in0 ^ in1 ^ in6; + out5 = tmp0 ^ in6; + out2 = tmp0 ^ in3; + out6 = tmp1 ^ in0; + out4 = tmp1 ^ in3 ^ in6; + tmp2 = out5 ^ out7 ^ in2; + out1 = tmp2 ^ out2; + out0 = tmp2 ^ in4; + out3 = tmp2 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_CC(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in5; + tmp1 = in1 ^ in6; + out1 = in2 ^ in3 ^ in7; + out5 = tmp0 ^ in6; + out0 = tmp1 ^ in2; + tmp2 = out5 ^ in0 ^ in7; + out3 = tmp2 ^ in4; + out6 = tmp0 ^ out3; + out7 = tmp1 ^ tmp2 ^ in3; + tmp3 = out1 ^ out6; + out4 = tmp2 ^ tmp3; + out2 = tmp3 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_CD(void 
*out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out5 = in3 ^ in6; + tmp0 = in0 ^ in1; + tmp1 = in2 ^ in7; + out6 = in0 ^ in4 ^ in7; + out2 = tmp0 ^ out5 ^ in4; + out7 = tmp0 ^ in5; + out0 = tmp0 ^ in2 ^ in6; + out4 = tmp1 ^ in5; + out1 = tmp1 ^ in1 ^ in3; + out3 = out6 ^ in5 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_CE(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in5; + tmp1 = tmp0 ^ in3; + out4 = tmp1 ^ in4; + tmp2 = out4 ^ in6; + out3 = tmp2 ^ in0; + out5 = tmp2 ^ in2; + out2 = out3 ^ in5 ^ in7; + out6 = tmp1 ^ out2; + out7 = out2 ^ out4 ^ in1; + out1 = tmp2 ^ out6; + out0 = tmp0 ^ out7 ^ in0; + + out_ptr[0] = out0 ^ 
in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_CF(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in6; + tmp1 = in0 ^ in1 ^ in5; + out4 = in2 ^ in3 ^ in5; + out5 = tmp0 ^ in4; + out7 = tmp1 ^ in6; + out1 = tmp1 ^ out4 ^ in7; + tmp2 = out5 ^ in0; + out2 = tmp2 ^ in7; + out3 = tmp2 ^ out4; + out6 = tmp0 ^ out2 ^ in5; + out0 = tmp0 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_D0(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = 
out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in3; + tmp1 = in1 ^ in4; + tmp2 = in2 ^ in5; + out7 = tmp0 ^ tmp1; + out0 = tmp1 ^ tmp2; + tmp3 = tmp2 ^ in3; + out1 = tmp3 ^ in6; + tmp4 = out1 ^ in1; + out2 = tmp4 ^ in7; + out3 = out2 ^ in2; + out4 = tmp0 ^ out3; + out5 = tmp3 ^ out3; + out6 = tmp4 ^ out4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_D1(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in5 ^ in6; + tmp1 = tmp0 ^ in1; + out1 = tmp1 ^ in2; + out2 = tmp1 ^ in7; + out3 = out2 ^ in3; + out5 = out3 ^ in2; + tmp2 = out3 ^ in0; + out4 = tmp2 ^ in4; + out7 = tmp0 ^ out4; + out6 = tmp2 ^ out1 ^ in6; + out0 = out2 ^ out6 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] 
= out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_D2(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in5 ^ in6; + out2 = tmp0 ^ in2 ^ in3; + out1 = out2 ^ in0; + out3 = out2 ^ in1; + out4 = out1 ^ in1 ^ in2; + out6 = out1 ^ in6 ^ in7; + out7 = out4 ^ in4 ^ in5; + out5 = out4 ^ out6 ^ in4; + out0 = tmp0 ^ out5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_D3(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in3 ^ in5 ^ in6; + tmp0 = out2 ^ in2; + tmp1 = tmp0 ^ in1; + out1 = tmp1 ^ in0; + out3 = tmp1 ^ in3; + out4 = out1 ^ in2 ^ in4; + tmp2 = out4 ^ in5; + out7 = tmp2 ^ in7; + out0 = tmp0 ^ out7; + 
tmp3 = out0 ^ in0; + out5 = tmp3 ^ in6; + out6 = tmp2 ^ tmp3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_D4(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in3 ^ in5; + tmp0 = in1 ^ in5; + tmp1 = tmp0 ^ in2; + out4 = tmp1 ^ in0; + tmp2 = tmp1 ^ in6; + out2 = out4 ^ in3 ^ in7; + out0 = tmp2 ^ in4; + out5 = tmp2 ^ out3; + out1 = tmp0 ^ out5 ^ in7; + out6 = tmp0 ^ out2 ^ in4; + out7 = tmp1 ^ out6 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_D5(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t 
in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in5; + tmp0 = in0 ^ in4; + tmp1 = tmp0 ^ in1 ^ in5; + out4 = tmp1 ^ in2; + out0 = out4 ^ in6; + tmp2 = tmp0 ^ out0; + out5 = tmp2 ^ in3; + out1 = out5 ^ in7; + out6 = tmp1 ^ out1; + out7 = tmp2 ^ out6; + out2 = out7 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_D6(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in2 ^ in4 ^ in6; + out5 = tmp0 ^ in3; + out0 = tmp0 ^ in5 ^ in7; + out3 = out0 ^ out5 ^ in2; + tmp1 = out3 ^ in0; + out1 = tmp1 ^ in6; + out2 = tmp1 ^ in7; + out4 = tmp1 ^ in1; + out6 = tmp1 ^ in4; + out7 = tmp0 ^ out2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + 
out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_D7(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in3; + out3 = in2 ^ in5 ^ in7; + out2 = tmp0 ^ in5; + tmp1 = tmp0 ^ out3 ^ in1; + out1 = tmp1 ^ in6; + out4 = tmp1 ^ in4; + tmp2 = out1 ^ in4; + out6 = tmp2 ^ in1; + out7 = tmp2 ^ in2; + out0 = tmp2 ^ in3; + out5 = tmp2 ^ in0 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_D8(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in0; + out5 = in1; + tmp0 = in1 ^ in2; + out6 = in0 ^ in2; + out0 = tmp0 ^ in4; + tmp1 = tmp0 ^ in3; + out7 = tmp1 ^ out6; + out2 = tmp1 ^ in6; + 
out3 = out7 ^ in7; + out1 = tmp1 ^ in1 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_D9(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in0 ^ in4; + out5 = in1 ^ in5; + out2 = in1 ^ in3 ^ in6; + out3 = in0 ^ in1 ^ in7; + out6 = in0 ^ in2 ^ in6; + out0 = out4 ^ in1 ^ in2; + out1 = out5 ^ in2 ^ in3; + out7 = out3 ^ in3; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_DA(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = 
out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out5 = in1 ^ in4; + tmp0 = in2 ^ in7; + tmp1 = in0 ^ in2 ^ in3; + out0 = tmp0 ^ out5; + out4 = tmp0 ^ tmp1; + out2 = tmp0 ^ in3 ^ in6; + out1 = tmp1 ^ in5; + out3 = tmp1 ^ in1; + out6 = out1 ^ in3; + out7 = out3 ^ in2 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_DB(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + tmp1 = in1 ^ in5; + tmp2 = in3 ^ in7; + out3 = tmp0 ^ in2; + out5 = tmp1 ^ in4; + out6 = tmp1 ^ out3 ^ in6; + out2 = tmp2 ^ in6; + tmp3 = tmp2 ^ in4; + tmp4 = out3 ^ in3; + out4 = tmp3 ^ in0; + out1 = tmp4 ^ in5; + out0 = tmp3 ^ tmp4; + out7 = tmp0 ^ out2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + 
out_ptr++; + } +} + +static void +gf8_muladd_DC(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in2; + tmp1 = in0 ^ in3; + out6 = tmp0 ^ in4; + tmp2 = tmp0 ^ in7; + out3 = tmp1 ^ in6; + tmp3 = tmp1 ^ in1; + out1 = tmp1 ^ tmp2 ^ in5; + out4 = tmp2 ^ in6; + out2 = tmp3 ^ in2; + out7 = tmp3 ^ in5; + out5 = tmp2 ^ out2; + out0 = out2 ^ out3 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_DD(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in0 ^ in6; + out2 = in0 ^ in1 ^ in3; + out6 = out3 ^ in2 ^ in4; + out7 = out2 ^ in5 ^ in7; + out0 = out6 ^ in1; + out4 = out6 ^ in7; + out5 = out7 ^ in0; + out1 = out5 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; 
+ out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_DE(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in3 ^ in6; + tmp1 = in3 ^ in4 ^ in7; + out4 = tmp0 ^ in0; + out5 = tmp1 ^ in1; + out3 = out4 ^ in7; + out2 = out3 ^ in6; + out1 = out2 ^ in5; + out6 = tmp1 ^ out1; + out0 = tmp0 ^ out5; + out7 = out0 ^ out1 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_DF(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; 
+ uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in0 ^ in3 ^ in7; + tmp0 = out2 ^ in1 ^ in5; + out1 = tmp0 ^ in2; + out7 = tmp0 ^ in6; + out5 = tmp0 ^ in0 ^ in4; + tmp1 = out1 ^ out5 ^ in6; + out4 = tmp1 ^ in3; + out6 = tmp1 ^ in5; + tmp2 = tmp1 ^ in7; + out0 = tmp2 ^ in1; + out3 = tmp2 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_E0(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in1 ^ in7; + tmp0 = in2 ^ in4; + out4 = out3 ^ in3 ^ in5; + out2 = tmp0 ^ in1; + tmp1 = tmp0 ^ in6; + out0 = out4 ^ in2; + out6 = out4 ^ in0; + out1 = tmp1 ^ in3; + out5 = tmp1 ^ in0; + out7 = out5 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_E1(void *out, void *in) +{ + 
unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in1 ^ in4; + tmp0 = in1 ^ in7; + out3 = tmp0 ^ in3; + tmp1 = out3 ^ in5; + out4 = tmp1 ^ in4; + tmp2 = tmp1 ^ in0; + out0 = tmp2 ^ in2; + out6 = tmp2 ^ in6; + tmp3 = out0 ^ out4 ^ in6; + out5 = tmp3 ^ in5; + out7 = tmp0 ^ tmp3; + out1 = tmp2 ^ out5 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_E2(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in1 ^ in2; + out4 = in1 ^ in5; + out2 = in2 ^ in4 ^ in7; + out5 = in0 ^ in2 ^ in6; + out0 = out3 ^ in3 ^ in5; + out7 = out3 ^ in0 ^ in4; + out6 = out2 ^ out7 ^ in3; + out1 = out5 ^ in3 ^ in4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH 
* 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_E3(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in4 ^ in7; + tmp0 = in1 ^ in3; + out3 = tmp0 ^ in2; + tmp1 = out3 ^ in0; + out0 = tmp1 ^ in5; + tmp2 = tmp1 ^ in4; + out1 = tmp2 ^ in6; + tmp3 = tmp2 ^ in3; + out7 = tmp3 ^ in7; + out6 = out1 ^ out2 ^ in2; + tmp4 = tmp0 ^ out0; + out5 = tmp4 ^ in6; + out4 = tmp3 ^ tmp4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_E4(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = 
out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in6; + tmp0 = in0 ^ in4; + tmp1 = tmp0 ^ in2 ^ in6; + out2 = tmp1 ^ in1; + out7 = out2 ^ in5; + tmp2 = tmp0 ^ out7; + out4 = tmp2 ^ in3; + out0 = out4 ^ in7; + out6 = tmp1 ^ out0; + out5 = tmp2 ^ out6; + out1 = out5 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_E5(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in3 ^ in6; + tmp0 = in0 ^ in1; + tmp1 = in5 ^ in7; + out2 = tmp0 ^ in4 ^ in6; + tmp2 = tmp1 ^ out2; + out6 = tmp2 ^ in3; + out7 = tmp2 ^ in2; + out0 = out6 ^ in2 ^ in4; + out5 = out6 ^ in1 ^ in2; + out1 = tmp0 ^ out5 ^ in5; + out4 = tmp1 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static 
void +gf8_muladd_E6(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in2 ^ in6 ^ in7; + out2 = out3 ^ in0 ^ in4; + out4 = out3 ^ in1 ^ in5; + out1 = out2 ^ in3; + out7 = out2 ^ out4 ^ in2; + out0 = out4 ^ in3 ^ in7; + out5 = out1 ^ in4; + out6 = out0 ^ out2 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_E7(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in3; + out3 = tmp0 ^ in6 ^ in7; + tmp1 = out3 ^ in0; + out5 = tmp1 ^ in5; + tmp2 = tmp1 ^ in4; + tmp3 = out5 ^ in7; + out1 = tmp2 ^ in1; + out0 = tmp3 ^ in1; + out6 = out1 ^ in2; + out2 = tmp0 ^ tmp2; + tmp4 = tmp3 ^ out6; + out4 = tmp4 ^ in6; + out7 = tmp4 ^ in0; + + out_ptr[0] = out0 
^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_E8(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in3 ^ in6; + tmp0 = in4 ^ in7; + out1 = in2 ^ in3 ^ in4; + out5 = tmp0 ^ in0; + tmp1 = tmp0 ^ in1; + tmp2 = tmp1 ^ in5; + out0 = tmp1 ^ out1; + out2 = tmp2 ^ in2; + out6 = tmp2 ^ out5; + tmp3 = out6 ^ in6; + out3 = tmp3 ^ in7; + out7 = tmp3 ^ in2 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_E9(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 
= out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + tmp1 = in3 ^ in6; + tmp2 = tmp0 ^ in6; + out4 = tmp1 ^ in4; + out6 = tmp2 ^ in5; + out7 = tmp2 ^ in2 ^ in7; + out3 = out6 ^ in3 ^ in7; + out0 = tmp1 ^ out7; + out2 = out3 ^ out4 ^ in0; + out5 = tmp0 ^ out2; + out1 = out0 ^ out5 ^ in5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_EA(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in6 ^ in7; + out5 = in0 ^ in7; + out6 = in0 ^ in1; + out0 = in1 ^ in2 ^ in3; + out2 = in2 ^ in4 ^ in5; + out7 = out6 ^ in2; + out1 = out0 ^ out6 ^ in4; + out3 = out7 ^ in5 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_EB(void *out, void 
*in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in4 ^ in5; + tmp0 = in0 ^ in1; + out4 = in4 ^ in6 ^ in7; + out5 = in0 ^ in5 ^ in7; + out6 = tmp0 ^ in6; + tmp1 = tmp0 ^ in2; + out0 = tmp1 ^ in3; + out7 = tmp1 ^ in7; + out1 = out0 ^ in4; + out3 = out0 ^ in5 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_EC(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out3 = in0 ^ in5; + out4 = in2 ^ in3 ^ in7; + out5 = in0 ^ in3 ^ in4; + out6 = out3 ^ in1 ^ in4; + out1 = out4 ^ in4; + out0 = out4 ^ in1 ^ in6; + out2 = out0 ^ out5 ^ in5; + out7 = out2 ^ in4 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + 
out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_ED(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in2 ^ in4; + tmp1 = in3 ^ in5; + out4 = tmp0 ^ in3 ^ in7; + out3 = tmp1 ^ in0; + out1 = out4 ^ in1; + out5 = out3 ^ in4; + out7 = out1 ^ out5 ^ in6; + out2 = tmp0 ^ out7; + out0 = tmp1 ^ out7; + out6 = out2 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_EE(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = 
out_ptr[WIDTH * 7]; + + out4 = in2; + tmp0 = in0 ^ in1; + out5 = in0 ^ in3; + tmp1 = tmp0 ^ in2; + out6 = tmp0 ^ in4; + tmp2 = tmp1 ^ out5; + out7 = tmp1 ^ in5; + out1 = tmp2 ^ out6 ^ in7; + out0 = tmp2 ^ in6; + tmp3 = out7 ^ in1; + out3 = tmp3 ^ in7; + out2 = tmp3 ^ in4 ^ in6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_EF(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out4 = in2 ^ in4; + tmp0 = in0 ^ in5; + tmp1 = in4 ^ in6; + out5 = tmp0 ^ in3; + out2 = tmp0 ^ tmp1; + out6 = tmp1 ^ in0 ^ in1; + out3 = out5 ^ in2 ^ in7; + out7 = out3 ^ in1 ^ in3; + out0 = out4 ^ out6 ^ in3; + out1 = tmp1 ^ out0 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_F0(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t 
*out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in2; + tmp1 = in4 ^ in5; + out2 = tmp0 ^ in6; + out3 = tmp1 ^ in1; + tmp2 = tmp1 ^ in7; + out1 = out2 ^ out3 ^ in3; + tmp3 = tmp0 ^ tmp2; + out0 = tmp3 ^ in3; + out5 = tmp3 ^ in0; + out4 = out1 ^ out5 ^ in4; + out7 = out4 ^ in2; + out6 = tmp2 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_F1(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in1 ^ in6; + tmp0 = in3 ^ in5; + out3 = tmp0 ^ in1 ^ in4; + tmp1 = out3 ^ in2; + out1 = tmp1 ^ in6; + tmp2 = tmp1 ^ in0; + tmp3 = out1 ^ in5; + out0 = tmp2 ^ in7; + out6 = tmp2 ^ in4; + out7 = tmp3 ^ in0; + out5 = tmp0 ^ out0; + out4 = tmp3 ^ out5 ^ in1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = 
out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_F2(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in4 ^ in5; + out2 = in2 ^ in6 ^ in7; + tmp1 = tmp0 ^ in1; + tmp2 = tmp1 ^ in2; + out0 = tmp2 ^ in3; + out3 = tmp2 ^ in7; + out5 = out3 ^ in0 ^ in4; + tmp3 = tmp0 ^ out5; + out7 = tmp3 ^ in3; + out4 = tmp3 ^ out2; + out1 = out0 ^ out4 ^ in4; + out6 = tmp1 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_F3(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + 
uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in6 ^ in7; + tmp0 = in0 ^ in1; + out4 = tmp0 ^ in6; + tmp1 = tmp0 ^ in2; + out5 = tmp1 ^ in7; + out6 = tmp1 ^ in3; + out7 = out6 ^ in4; + out0 = out7 ^ in5; + out1 = out0 ^ in6; + out3 = out0 ^ in0 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_F4(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in0 ^ in1 ^ in2; + tmp0 = out2 ^ in3; + out4 = tmp0 ^ in4; + out5 = out4 ^ in5; + out6 = out5 ^ in6; + out7 = out6 ^ in7; + out0 = out7 ^ in0; + out1 = out0 ^ in1; + out3 = tmp0 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_F5(void *out, void *in) +{ + unsigned int i; + uint64_t 
*in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in0 ^ in1; + tmp0 = out2 ^ in2; + out4 = tmp0 ^ in3; + out5 = out4 ^ in4; + out6 = out5 ^ in5; + out7 = out6 ^ in6; + out0 = out7 ^ in7; + out1 = out0 ^ in0; + out3 = tmp0 ^ out0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_F6(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in7; + out2 = tmp0 ^ in2; + out4 = out2 ^ in1 ^ in4; + out7 = out4 ^ in3 ^ in5; + out5 = out7 ^ in4 ^ in7; + out0 = tmp0 ^ out7 ^ in6; + tmp1 = out0 ^ in1; + out6 = out0 ^ in0 ^ in5; + out3 = tmp1 ^ in3; + out1 = tmp0 ^ tmp1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = 
out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_F7(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in0 ^ in7; + tmp0 = out2 ^ in1; + out4 = tmp0 ^ in2; + out5 = out4 ^ in3 ^ in7; + out6 = out5 ^ in4; + out7 = out6 ^ in5; + out0 = out7 ^ in6; + out1 = out0 ^ in7; + out3 = tmp0 ^ out1; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_F8(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in4; + tmp1 = in3 ^ in5; 
+ tmp2 = tmp0 ^ in6; + out4 = tmp0 ^ tmp1; + out1 = tmp1 ^ in2 ^ in4; + out3 = tmp2 ^ in1; + out5 = out3 ^ in5; + out7 = out1 ^ out5 ^ in7; + out6 = tmp1 ^ out7; + out0 = tmp2 ^ out7; + out2 = out6 ^ in0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_F9(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in3 ^ in5; + tmp1 = in0 ^ in6; + out4 = tmp0 ^ in0; + tmp2 = tmp1 ^ in4; + tmp3 = tmp1 ^ in2; + out5 = tmp2 ^ in1; + out3 = out5 ^ in3; + tmp4 = tmp3 ^ out3; + out1 = tmp4 ^ in5; + out0 = tmp4 ^ in0 ^ in7; + out6 = tmp0 ^ out0 ^ in4; + out7 = tmp2 ^ tmp4; + out2 = tmp3 ^ out6; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_FA(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr 
= (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in1; + tmp1 = tmp0 ^ in2; + tmp2 = tmp0 ^ in5; + tmp3 = tmp1 ^ in7; + out5 = tmp2 ^ in6; + out6 = tmp3 ^ in6; + out7 = tmp3 ^ in3; + out3 = out6 ^ in4; + out2 = tmp1 ^ out5; + out4 = out2 ^ out3 ^ in1; + out0 = out4 ^ out7 ^ in5; + out1 = tmp2 ^ out0; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_FB(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in5 ^ in6; + tmp0 = in0 ^ in1; + out4 = in0 ^ in5 ^ in7; + out5 = tmp0 ^ in6; + tmp1 = tmp0 ^ in2; + out6 = tmp1 ^ in7; + out7 = tmp1 ^ in3; + out0 = out7 ^ in4; + out1 = out0 ^ in5; + out3 = out0 ^ in6 ^ in7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 
2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_FC(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in1 ^ in2; + tmp1 = in0 ^ in7; + out2 = tmp0 ^ tmp1 ^ in5; + out3 = tmp1 ^ in4; + tmp2 = out2 ^ in6; + out6 = tmp2 ^ in4; + out7 = tmp2 ^ in3; + out4 = out6 ^ in1 ^ in3; + tmp3 = out4 ^ in0; + out1 = tmp3 ^ in6; + out0 = tmp3 ^ in1 ^ in5; + out5 = tmp0 ^ out4; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_FD(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = 
out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in5; + tmp1 = in1 ^ in7; + out2 = tmp0 ^ tmp1; + out6 = out2 ^ in2 ^ in4; + tmp2 = out6 ^ in0; + out1 = tmp2 ^ in3; + out0 = tmp0 ^ out1 ^ in6; + out5 = out0 ^ in2; + tmp3 = out5 ^ in1; + out3 = tmp3 ^ in6; + out7 = tmp2 ^ tmp3; + out4 = tmp1 ^ out7; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void +gf8_muladd_FE(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + tmp0 = in0 ^ in2; + out2 = tmp0 ^ in5; + out3 = tmp0 ^ in4; + tmp1 = out3 ^ in6; + out4 = tmp1 ^ in5; + tmp2 = tmp1 ^ in1; + out6 = tmp2 ^ in7; + tmp3 = tmp2 ^ in0; + out0 = tmp3 ^ in3; + tmp4 = out0 ^ out4 ^ in7; + out5 = tmp4 ^ in6; + out7 = tmp4 ^ in2; + out1 = tmp3 ^ out5; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; 
+ out_ptr++; + } +} + +static void +gf8_muladd_FF(void *out, void *in) +{ + unsigned int i; + uint64_t *in_ptr = (uint64_t *)in; + uint64_t *out_ptr = (uint64_t *)out; + + for (i = 0; i < WIDTH; i++) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + uint64_t tmp0, tmp1, tmp2, tmp3; + + uint64_t in0 = out_ptr[0]; + uint64_t in1 = out_ptr[WIDTH]; + uint64_t in2 = out_ptr[WIDTH * 2]; + uint64_t in3 = out_ptr[WIDTH * 3]; + uint64_t in4 = out_ptr[WIDTH * 4]; + uint64_t in5 = out_ptr[WIDTH * 5]; + uint64_t in6 = out_ptr[WIDTH * 6]; + uint64_t in7 = out_ptr[WIDTH * 7]; + + out2 = in0 ^ in5; + tmp0 = in4 ^ in7; + tmp1 = out2 ^ in2; + out4 = tmp1 ^ in6; + out7 = tmp1 ^ in1 ^ in3; + out1 = tmp0 ^ out7; + tmp2 = out1 ^ in5; + out6 = tmp2 ^ in3; + tmp3 = tmp2 ^ in7; + out0 = tmp3 ^ in6; + out3 = tmp3 ^ in1; + out5 = tmp0 ^ out0 ^ in2; + + out_ptr[0] = out0 ^ in_ptr[0]; + out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; + out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; + out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; + out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; + out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; + out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; + out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + + in_ptr++; + out_ptr++; + } +} + +static void (*gf8_muladd[])(void *out, void *in) = { + gf8_muladd_00, gf8_muladd_01, gf8_muladd_02, gf8_muladd_03, gf8_muladd_04, + gf8_muladd_05, gf8_muladd_06, gf8_muladd_07, gf8_muladd_08, gf8_muladd_09, + gf8_muladd_0A, gf8_muladd_0B, gf8_muladd_0C, gf8_muladd_0D, gf8_muladd_0E, + gf8_muladd_0F, gf8_muladd_10, gf8_muladd_11, gf8_muladd_12, gf8_muladd_13, + gf8_muladd_14, gf8_muladd_15, gf8_muladd_16, gf8_muladd_17, gf8_muladd_18, + gf8_muladd_19, gf8_muladd_1A, gf8_muladd_1B, gf8_muladd_1C, gf8_muladd_1D, + gf8_muladd_1E, gf8_muladd_1F, gf8_muladd_20, gf8_muladd_21, gf8_muladd_22, + gf8_muladd_23, gf8_muladd_24, gf8_muladd_25, gf8_muladd_26, gf8_muladd_27, + gf8_muladd_28, gf8_muladd_29, gf8_muladd_2A, gf8_muladd_2B, 
gf8_muladd_2C, + gf8_muladd_2D, gf8_muladd_2E, gf8_muladd_2F, gf8_muladd_30, gf8_muladd_31, + gf8_muladd_32, gf8_muladd_33, gf8_muladd_34, gf8_muladd_35, gf8_muladd_36, + gf8_muladd_37, gf8_muladd_38, gf8_muladd_39, gf8_muladd_3A, gf8_muladd_3B, + gf8_muladd_3C, gf8_muladd_3D, gf8_muladd_3E, gf8_muladd_3F, gf8_muladd_40, + gf8_muladd_41, gf8_muladd_42, gf8_muladd_43, gf8_muladd_44, gf8_muladd_45, + gf8_muladd_46, gf8_muladd_47, gf8_muladd_48, gf8_muladd_49, gf8_muladd_4A, + gf8_muladd_4B, gf8_muladd_4C, gf8_muladd_4D, gf8_muladd_4E, gf8_muladd_4F, + gf8_muladd_50, gf8_muladd_51, gf8_muladd_52, gf8_muladd_53, gf8_muladd_54, + gf8_muladd_55, gf8_muladd_56, gf8_muladd_57, gf8_muladd_58, gf8_muladd_59, + gf8_muladd_5A, gf8_muladd_5B, gf8_muladd_5C, gf8_muladd_5D, gf8_muladd_5E, + gf8_muladd_5F, gf8_muladd_60, gf8_muladd_61, gf8_muladd_62, gf8_muladd_63, + gf8_muladd_64, gf8_muladd_65, gf8_muladd_66, gf8_muladd_67, gf8_muladd_68, + gf8_muladd_69, gf8_muladd_6A, gf8_muladd_6B, gf8_muladd_6C, gf8_muladd_6D, + gf8_muladd_6E, gf8_muladd_6F, gf8_muladd_70, gf8_muladd_71, gf8_muladd_72, + gf8_muladd_73, gf8_muladd_74, gf8_muladd_75, gf8_muladd_76, gf8_muladd_77, + gf8_muladd_78, gf8_muladd_79, gf8_muladd_7A, gf8_muladd_7B, gf8_muladd_7C, + gf8_muladd_7D, gf8_muladd_7E, gf8_muladd_7F, gf8_muladd_80, gf8_muladd_81, + gf8_muladd_82, gf8_muladd_83, gf8_muladd_84, gf8_muladd_85, gf8_muladd_86, + gf8_muladd_87, gf8_muladd_88, gf8_muladd_89, gf8_muladd_8A, gf8_muladd_8B, + gf8_muladd_8C, gf8_muladd_8D, gf8_muladd_8E, gf8_muladd_8F, gf8_muladd_90, + gf8_muladd_91, gf8_muladd_92, gf8_muladd_93, gf8_muladd_94, gf8_muladd_95, + gf8_muladd_96, gf8_muladd_97, gf8_muladd_98, gf8_muladd_99, gf8_muladd_9A, + gf8_muladd_9B, gf8_muladd_9C, gf8_muladd_9D, gf8_muladd_9E, gf8_muladd_9F, + gf8_muladd_A0, gf8_muladd_A1, gf8_muladd_A2, gf8_muladd_A3, gf8_muladd_A4, + gf8_muladd_A5, gf8_muladd_A6, gf8_muladd_A7, gf8_muladd_A8, gf8_muladd_A9, + gf8_muladd_AA, gf8_muladd_AB, gf8_muladd_AC, 
gf8_muladd_AD, gf8_muladd_AE, + gf8_muladd_AF, gf8_muladd_B0, gf8_muladd_B1, gf8_muladd_B2, gf8_muladd_B3, + gf8_muladd_B4, gf8_muladd_B5, gf8_muladd_B6, gf8_muladd_B7, gf8_muladd_B8, + gf8_muladd_B9, gf8_muladd_BA, gf8_muladd_BB, gf8_muladd_BC, gf8_muladd_BD, + gf8_muladd_BE, gf8_muladd_BF, gf8_muladd_C0, gf8_muladd_C1, gf8_muladd_C2, + gf8_muladd_C3, gf8_muladd_C4, gf8_muladd_C5, gf8_muladd_C6, gf8_muladd_C7, + gf8_muladd_C8, gf8_muladd_C9, gf8_muladd_CA, gf8_muladd_CB, gf8_muladd_CC, + gf8_muladd_CD, gf8_muladd_CE, gf8_muladd_CF, gf8_muladd_D0, gf8_muladd_D1, + gf8_muladd_D2, gf8_muladd_D3, gf8_muladd_D4, gf8_muladd_D5, gf8_muladd_D6, + gf8_muladd_D7, gf8_muladd_D8, gf8_muladd_D9, gf8_muladd_DA, gf8_muladd_DB, + gf8_muladd_DC, gf8_muladd_DD, gf8_muladd_DE, gf8_muladd_DF, gf8_muladd_E0, + gf8_muladd_E1, gf8_muladd_E2, gf8_muladd_E3, gf8_muladd_E4, gf8_muladd_E5, + gf8_muladd_E6, gf8_muladd_E7, gf8_muladd_E8, gf8_muladd_E9, gf8_muladd_EA, + gf8_muladd_EB, gf8_muladd_EC, gf8_muladd_ED, gf8_muladd_EE, gf8_muladd_EF, + gf8_muladd_F0, gf8_muladd_F1, gf8_muladd_F2, gf8_muladd_F3, gf8_muladd_F4, + gf8_muladd_F5, gf8_muladd_F6, gf8_muladd_F7, gf8_muladd_F8, gf8_muladd_F9, + gf8_muladd_FA, gf8_muladd_FB, gf8_muladd_FC, gf8_muladd_FD, gf8_muladd_FE, + gf8_muladd_FF}; + +static uint64_t zero[EC_METHOD_WORD_SIZE * 8] = { + 0, +}; + +void +ec_code_c_prepare(ec_gf_t *gf, uint32_t *values, uint32_t count) +{ + uint32_t i, last, tmp; + + last = 1; + for (i = count; i > 0; i--) { + if (values[i - 1] != 0) { + tmp = values[i - 1]; + values[i - 1] = ec_gf_div(gf, tmp, last); + last = tmp; + } + } +} + +void +ec_code_c_linear(void *dst, void *src, uint64_t offset, uint32_t *values, + uint32_t count) +{ + src += offset; + gf8_muladd_00(dst, src); + while (--count > 0) { + src += EC_METHOD_CHUNK_SIZE; + gf8_muladd[*values](dst, src); + values++; + } +} + +void +ec_code_c_interleaved(void *dst, void **src, uint64_t offset, uint32_t *values, + uint32_t count) +{ + uint32_t i, last, tmp; 
+ + i = 0; + while ((last = *values++) == 0) { + i++; + } + gf8_muladd_00(dst, src[i++] + offset); + while (i < count) { + tmp = *values++; + if (tmp != 0) { + gf8_muladd[last](dst, src[i] + offset); + last = tmp; + } + i++; + } + gf8_muladd[last](dst, zero); +} diff --git a/xlators/cluster/ec/src/ec-code-c.h b/xlators/cluster/ec/src/ec-code-c.h new file mode 100644 index 00000000000..42b5a064eb8 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-c.h @@ -0,0 +1,27 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_CODE_C_H__ +#define __EC_CODE_C_H__ + +#include "ec-types.h" + +void +ec_code_c_prepare(ec_gf_t *gf, uint32_t *values, uint32_t count); + +void +ec_code_c_linear(void *dst, void *src, uint64_t offset, uint32_t *values, + uint32_t count); + +void +ec_code_c_interleaved(void *dst, void **src, uint64_t offset, uint32_t *values, + uint32_t count); + +#endif /* __EC_CODE_C_H__ */ diff --git a/xlators/cluster/ec/src/ec-code-intel.c b/xlators/cluster/ec/src/ec-code-intel.c new file mode 100644 index 00000000000..f1c4e13e321 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-intel.c @@ -0,0 +1,594 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <inttypes.h> +#include <string.h> +#include <errno.h> + +#include "ec-code-intel.h" + +static void +ec_code_intel_init(ec_code_intel_t *intel) +{ + memset(intel, 0, sizeof(ec_code_intel_t)); +} + +static void +ec_code_intel_prefix(ec_code_intel_t *intel, uint8_t prefix) +{ + intel->prefix.data[intel->prefix.bytes++] = prefix; +} + +static void +ec_code_intel_rex(ec_code_intel_t *intel, gf_boolean_t w) +{ + gf_boolean_t present = _gf_false; + + if (w) { + intel->rex.w = 1; + present = _gf_true; + } + if (intel->modrm.present) { + if (intel->modrm.reg > 7) { + intel->modrm.reg &= 7; + intel->rex.r = 1; + present = _gf_true; + } + if (intel->sib.present) { + if (intel->sib.index > 7) { + intel->sib.index &= 7; + intel->rex.x = 1; + present = _gf_true; + } + if (intel->sib.base > 7) { + intel->sib.base &= 7; + intel->rex.b = 1; + present = _gf_true; + } + } else if (intel->modrm.rm > 7) { + intel->modrm.rm &= 7; + intel->rex.b = 1; + present = _gf_true; + } + } else if (intel->reg > 7) { + intel->reg &= 7; + intel->rex.b = 1; + present = _gf_true; + } + intel->rex.present = present; +} + +static void +ec_code_intel_vex(ec_code_intel_t *intel, gf_boolean_t w, gf_boolean_t l, + ec_code_vex_opcode_t opcode, ec_code_vex_prefix_t prefix, + uint32_t reg) +{ + ec_code_intel_rex(intel, w); + if (((intel->rex.w == 1) || (intel->rex.x == 0) || (intel->rex.b == 0)) || + ((opcode != VEX_OPCODE_NONE) && (opcode != VEX_OPCODE_0F))) { + intel->rex.present = _gf_false; + + intel->vex.bytes = 3; + intel->vex.data[0] = 0xC4; + intel->vex.data[1] = ((intel->rex.r << 7) | (intel->rex.x << 6) | + (intel->rex.b << 5) | opcode) ^ + 0xE0; + intel->vex.data[2] = (intel->rex.w << 7) | ((~reg & 0x0F) << 3) | + (l ? 0x04 : 0x00) | prefix; + } else { + intel->vex.bytes = 2; + intel->vex.data[0] = 0xC5; + intel->vex.data[1] = (intel->rex.r << 7) | ((~reg & 0x0F) << 3) | + (l ? 
0x04 : 0x00) | prefix; + } +} + +static void +ec_code_intel_modrm_reg(ec_code_intel_t *intel, uint32_t rm, uint32_t reg) +{ + intel->modrm.present = _gf_true; + intel->modrm.mod = 3; + intel->modrm.rm = rm; + intel->modrm.reg = reg; +} + +static void +ec_code_intel_modrm_mem(ec_code_intel_t *intel, uint32_t reg, + ec_code_intel_reg_t base, ec_code_intel_reg_t index, + uint32_t scale, int32_t offset) +{ + if (index == REG_SP) { + intel->invalid = _gf_true; + return; + } + if ((index != REG_NULL) && (scale != 1) && (scale != 2) && (scale != 4) && + (scale != 8)) { + intel->invalid = _gf_true; + return; + } + scale >>= 1; + if (scale == 4) { + scale = 3; + } + + intel->modrm.present = _gf_true; + intel->modrm.reg = reg; + + intel->offset.value = offset; + if ((offset == 0) && (base != REG_BP)) { + intel->modrm.mod = 0; + intel->offset.bytes = 0; + } else if ((offset >= -128) && (offset <= 127)) { + intel->modrm.mod = 1; + intel->offset.bytes = 1; + } else { + intel->modrm.mod = 2; + intel->offset.bytes = 4; + } + + intel->modrm.rm = base; + if ((index != REG_NULL) || (base == REG_SP)) { + intel->modrm.rm = 4; + intel->sib.present = _gf_true; + intel->sib.index = index; + if (index == REG_NULL) { + intel->sib.index = 4; + } + intel->sib.scale = scale; + intel->sib.base = base; + if (base == REG_NULL) { + intel->sib.base = 5; + intel->modrm.mod = 0; + intel->offset.bytes = 4; + } + } else if (base == REG_NULL) { + intel->modrm.mod = 0; + intel->modrm.rm = 5; + intel->offset.bytes = 4; + } +} + +static void +ec_code_intel_op_1(ec_code_intel_t *intel, uint8_t opcode, uint32_t reg) +{ + intel->reg = reg; + intel->opcode.bytes = 1; + intel->opcode.data[0] = opcode; +} + +static void +ec_code_intel_op_2(ec_code_intel_t *intel, uint8_t opcode1, uint8_t opcode2, + uint32_t reg) +{ + intel->reg = reg; + intel->opcode.bytes = 2; + intel->opcode.data[0] = opcode1; + intel->opcode.data[1] = opcode2; +} + +static void +ec_code_intel_immediate_1(ec_code_intel_t *intel, uint32_t 
value) +{ + intel->immediate.bytes = 1; + intel->immediate.value = value; +} + +static void +ec_code_intel_immediate_2(ec_code_intel_t *intel, uint32_t value) +{ + intel->immediate.bytes = 2; + intel->immediate.value = value; +} + +static void +ec_code_intel_immediate_4(ec_code_intel_t *intel, uint32_t value) +{ + intel->immediate.bytes = 4; + intel->immediate.value = value; +} + +static void +ec_code_intel_emit(ec_code_builder_t *builder, ec_code_intel_t *intel) +{ + uint8_t insn[15]; + uint32_t i, count; + + if (intel->invalid) { + ec_code_error(builder, EINVAL); + return; + } + + count = 0; + for (i = 0; i < intel->prefix.bytes; i++) { + insn[count++] = intel->prefix.data[i]; + } + for (i = 0; i < intel->vex.bytes; i++) { + insn[count++] = intel->vex.data[i]; + } + if (intel->rex.present) { + insn[count++] = 0x40 | (intel->rex.w << 3) | (intel->rex.r << 2) | + (intel->rex.x << 1) | (intel->rex.b << 0); + } + for (i = 0; i < intel->opcode.bytes; i++) { + insn[count++] = intel->opcode.data[i]; + } + if (intel->modrm.present) { + insn[count++] = (intel->modrm.mod << 6) | (intel->modrm.reg << 3) | + (intel->modrm.rm << 0); + if (intel->sib.present) { + insn[count++] = (intel->sib.scale << 6) | (intel->sib.index << 3) | + (intel->sib.base << 0); + } + } + for (i = 0; i < intel->offset.bytes; i++) { + insn[count++] = intel->offset.data[i]; + } + for (i = 0; i < intel->immediate.bytes; i++) { + insn[count++] = intel->immediate.data[i]; + } + + ec_code_emit(builder, insn, count); +} + +void +ec_code_intel_op_push_r(ec_code_builder_t *builder, ec_code_intel_reg_t reg) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_op_1(&intel, 0x50 | (reg & 7), reg); + ec_code_intel_rex(&intel, _gf_false); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_pop_r(ec_code_builder_t *builder, ec_code_intel_reg_t reg) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_op_1(&intel, 0x58 | (reg & 7), reg); + 
ec_code_intel_rex(&intel, _gf_false); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_ret(ec_code_builder_t *builder, uint32_t size) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + if (size == 0) { + ec_code_intel_op_1(&intel, 0xC3, 0); + } else { + ec_code_intel_immediate_2(&intel, size); + ec_code_intel_op_1(&intel, 0xC2, 0); + } + ec_code_intel_rex(&intel, _gf_false); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_r2r(ec_code_builder_t *builder, ec_code_intel_reg_t src, + ec_code_intel_reg_t dst) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_modrm_reg(&intel, dst, src); + ec_code_intel_op_1(&intel, 0x89, 0); + ec_code_intel_rex(&intel, _gf_true); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_r2m(ec_code_builder_t *builder, ec_code_intel_reg_t src, + ec_code_intel_reg_t base, ec_code_intel_reg_t index, + uint32_t scale, int32_t offset) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_modrm_mem(&intel, src, base, index, scale, offset); + ec_code_intel_op_1(&intel, 0x89, 0); + ec_code_intel_rex(&intel, _gf_true); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_m2r(ec_code_builder_t *builder, ec_code_intel_reg_t base, + ec_code_intel_reg_t index, uint32_t scale, + int32_t offset, ec_code_intel_reg_t dst) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset); + ec_code_intel_op_1(&intel, 0x8B, 0); + ec_code_intel_rex(&intel, _gf_true); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_xor_r2r(ec_code_builder_t *builder, ec_code_intel_reg_t src, + ec_code_intel_reg_t dst) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_modrm_reg(&intel, dst, src); + ec_code_intel_op_1(&intel, 0x31, 0); + ec_code_intel_rex(&intel, _gf_true); + + ec_code_intel_emit(builder, &intel); +} + 
+void +ec_code_intel_op_xor_m2r(ec_code_builder_t *builder, ec_code_intel_reg_t base, + ec_code_intel_reg_t index, uint32_t scale, + int32_t offset, ec_code_intel_reg_t dst) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset); + ec_code_intel_op_1(&intel, 0x33, 0); + ec_code_intel_rex(&intel, _gf_true); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_add_i2r(ec_code_builder_t *builder, int32_t value, + ec_code_intel_reg_t reg) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + if ((value >= -128) && (value < 128)) { + ec_code_intel_modrm_reg(&intel, reg, 0); + ec_code_intel_op_1(&intel, 0x83, 0); + ec_code_intel_immediate_1(&intel, value); + } else { + if (reg == REG_AX) { + ec_code_intel_op_1(&intel, 0x05, reg); + } else { + ec_code_intel_modrm_reg(&intel, reg, 0); + ec_code_intel_op_1(&intel, 0x81, 0); + } + ec_code_intel_immediate_4(&intel, value); + } + ec_code_intel_rex(&intel, _gf_true); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_test_i2r(ec_code_builder_t *builder, uint32_t value, + ec_code_intel_reg_t reg) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + if (reg == REG_AX) { + ec_code_intel_op_1(&intel, 0xA9, reg); + } else { + ec_code_intel_modrm_reg(&intel, reg, 0); + ec_code_intel_op_1(&intel, 0xF7, 0); + } + ec_code_intel_immediate_4(&intel, value); + ec_code_intel_rex(&intel, _gf_true); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_jne(ec_code_builder_t *builder, uint32_t address) +{ + ec_code_intel_t intel; + int32_t rel; + + ec_code_intel_init(&intel); + + rel = address - builder->address - 2; + if ((rel >= -128) && (rel < 128)) { + ec_code_intel_op_1(&intel, 0x75, 0); + ec_code_intel_immediate_1(&intel, rel); + } else { + rel -= 4; + ec_code_intel_op_2(&intel, 0x0F, 0x85, 0); + ec_code_intel_immediate_4(&intel, rel); + } + ec_code_intel_rex(&intel, _gf_false); + + 
ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_sse2sse(ec_code_builder_t *builder, uint32_t src, + uint32_t dst) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_prefix(&intel, 0x66); + ec_code_intel_modrm_reg(&intel, src, dst); + ec_code_intel_op_2(&intel, 0x0F, 0x6F, 0); + ec_code_intel_rex(&intel, _gf_false); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_sse2m(ec_code_builder_t *builder, uint32_t src, + ec_code_intel_reg_t base, ec_code_intel_reg_t index, + uint32_t scale, int32_t offset) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_prefix(&intel, 0x66); + ec_code_intel_modrm_mem(&intel, src, base, index, scale, offset); + ec_code_intel_op_2(&intel, 0x0F, 0x7F, 0); + ec_code_intel_rex(&intel, _gf_false); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_m2sse(ec_code_builder_t *builder, ec_code_intel_reg_t base, + ec_code_intel_reg_t index, uint32_t scale, + int32_t offset, uint32_t dst) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_prefix(&intel, 0x66); + ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset); + ec_code_intel_op_2(&intel, 0x0F, 0x6F, 0); + ec_code_intel_rex(&intel, _gf_false); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_xor_sse2sse(ec_code_builder_t *builder, uint32_t src, + uint32_t dst) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_prefix(&intel, 0x66); + ec_code_intel_modrm_reg(&intel, src, dst); + ec_code_intel_op_2(&intel, 0x0F, 0xEF, 0); + ec_code_intel_rex(&intel, _gf_false); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_xor_m2sse(ec_code_builder_t *builder, ec_code_intel_reg_t base, + ec_code_intel_reg_t index, uint32_t scale, + int32_t offset, uint32_t dst) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_prefix(&intel, 0x66); + 
ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset); + ec_code_intel_op_2(&intel, 0x0F, 0xEF, 0); + ec_code_intel_rex(&intel, _gf_false); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_avx2avx(ec_code_builder_t *builder, uint32_t src, + uint32_t dst) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_modrm_reg(&intel, src, dst); + ec_code_intel_op_1(&intel, 0x6F, 0); + ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, VEX_PREFIX_66, + VEX_REG_NONE); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_avx2m(ec_code_builder_t *builder, uint32_t src, + ec_code_intel_reg_t base, ec_code_intel_reg_t index, + uint32_t scale, int32_t offset) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_modrm_mem(&intel, src, base, index, scale, offset); + ec_code_intel_op_1(&intel, 0x7F, 0); + ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, VEX_PREFIX_66, + VEX_REG_NONE); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_m2avx(ec_code_builder_t *builder, ec_code_intel_reg_t base, + ec_code_intel_reg_t index, uint32_t scale, + int32_t offset, uint32_t dst) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset); + ec_code_intel_op_1(&intel, 0x6F, 0); + ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, VEX_PREFIX_66, + VEX_REG_NONE); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_xor_avx2avx(ec_code_builder_t *builder, uint32_t src, + uint32_t dst) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_modrm_reg(&intel, src, dst); + ec_code_intel_op_1(&intel, 0xEF, 0); + ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, VEX_PREFIX_66, + dst); + + ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_xor_m2avx(ec_code_builder_t *builder, ec_code_intel_reg_t base, + 
ec_code_intel_reg_t index, uint32_t scale, + int32_t offset, uint32_t dst) +{ + ec_code_intel_t intel; + + ec_code_intel_init(&intel); + + ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset); + ec_code_intel_op_1(&intel, 0xEF, 0); + ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, VEX_PREFIX_66, + dst); + + ec_code_intel_emit(builder, &intel); +} diff --git a/xlators/cluster/ec/src/ec-code-intel.h b/xlators/cluster/ec/src/ec-code-intel.h new file mode 100644 index 00000000000..3fa4a174765 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-intel.h @@ -0,0 +1,191 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_CODE_INTEL_H__ +#define __EC_CODE_INTEL_H__ + +#include "ec-code.h" + +#define VEX_REG_NONE 0 + +enum _ec_code_intel_reg; +typedef enum _ec_code_intel_reg ec_code_intel_reg_t; + +enum _ec_code_vex_prefix; +typedef enum _ec_code_vex_prefix ec_code_vex_prefix_t; + +enum _ec_code_vex_opcode; +typedef enum _ec_code_vex_opcode ec_code_vex_opcode_t; + +struct _ec_code_intel_buffer; +typedef struct _ec_code_intel_buffer ec_code_intel_buffer_t; + +struct _ec_code_intel_sib; +typedef struct _ec_code_intel_sib ec_code_intel_sib_t; + +struct _ec_code_intel_modrm; +typedef struct _ec_code_intel_modrm ec_code_intel_modrm_t; + +struct _ec_code_intel_rex; +typedef struct _ec_code_intel_rex ec_code_intel_rex_t; + +struct _ec_code_intel; +typedef struct _ec_code_intel ec_code_intel_t; + +enum _ec_code_intel_reg { + REG_NULL = -1, + REG_AX, + REG_CX, + REG_DX, + REG_BX, + REG_SP, + REG_BP, + REG_SI, + REG_DI, + REG_8, + REG_9, + REG_10, + REG_11, + REG_12, + REG_13, + REG_14, + REG_15 +}; + +enum _ec_code_vex_prefix { + 
VEX_PREFIX_NONE = 0, + VEX_PREFIX_66, + VEX_PREFIX_F3, + VEX_PREFIX_F2 +}; + +enum _ec_code_vex_opcode { + VEX_OPCODE_NONE = 0, + VEX_OPCODE_0F, + VEX_OPCODE_0F_38, + VEX_OPCODE_0F_3A +}; + +struct _ec_code_intel_buffer { + uint32_t bytes; + union { + uint8_t data[4]; + uint32_t value; + }; +}; + +struct _ec_code_intel_sib { + gf_boolean_t present; + uint32_t base; + uint32_t index; + uint32_t scale; +}; + +struct _ec_code_intel_modrm { + gf_boolean_t present; + uint32_t mod; + uint32_t rm; + uint32_t reg; +}; + +struct _ec_code_intel_rex { + gf_boolean_t present; + uint32_t w; + uint32_t r; + uint32_t x; + uint32_t b; +}; + +struct _ec_code_intel { + gf_boolean_t invalid; + ec_code_intel_buffer_t prefix; + ec_code_intel_buffer_t opcode; + ec_code_intel_buffer_t offset; + ec_code_intel_buffer_t immediate; + ec_code_intel_buffer_t vex; + ec_code_intel_rex_t rex; + ec_code_intel_modrm_t modrm; + ec_code_intel_sib_t sib; + uint32_t reg; +}; + +void +ec_code_intel_op_push_r(ec_code_builder_t *builder, ec_code_intel_reg_t reg); +void +ec_code_intel_op_pop_r(ec_code_builder_t *builder, ec_code_intel_reg_t reg); +void +ec_code_intel_op_ret(ec_code_builder_t *builder, uint32_t size); + +void +ec_code_intel_op_mov_r2r(ec_code_builder_t *builder, ec_code_intel_reg_t src, + ec_code_intel_reg_t dst); +void +ec_code_intel_op_mov_r2m(ec_code_builder_t *builder, ec_code_intel_reg_t src, + ec_code_intel_reg_t base, ec_code_intel_reg_t index, + uint32_t scale, int32_t offset); +void +ec_code_intel_op_mov_m2r(ec_code_builder_t *builder, ec_code_intel_reg_t base, + ec_code_intel_reg_t index, uint32_t scale, + int32_t offset, ec_code_intel_reg_t dst); +void +ec_code_intel_op_xor_r2r(ec_code_builder_t *builder, ec_code_intel_reg_t src, + ec_code_intel_reg_t dst); +void +ec_code_intel_op_xor_m2r(ec_code_builder_t *builder, ec_code_intel_reg_t base, + ec_code_intel_reg_t index, uint32_t scale, + int32_t offset, ec_code_intel_reg_t dst); +void +ec_code_intel_op_add_i2r(ec_code_builder_t 
*builder, int32_t value, + ec_code_intel_reg_t reg); +void +ec_code_intel_op_test_i2r(ec_code_builder_t *builder, uint32_t value, + ec_code_intel_reg_t reg); +void +ec_code_intel_op_jne(ec_code_builder_t *builder, uint32_t address); + +void +ec_code_intel_op_mov_sse2sse(ec_code_builder_t *builder, uint32_t src, + uint32_t dst); +void +ec_code_intel_op_mov_sse2m(ec_code_builder_t *builder, uint32_t src, + ec_code_intel_reg_t base, ec_code_intel_reg_t index, + uint32_t scale, int32_t offset); +void +ec_code_intel_op_mov_m2sse(ec_code_builder_t *builder, ec_code_intel_reg_t base, + ec_code_intel_reg_t index, uint32_t scale, + int32_t offset, uint32_t dst); +void +ec_code_intel_op_xor_sse2sse(ec_code_builder_t *builder, uint32_t src, + uint32_t dst); +void +ec_code_intel_op_xor_m2sse(ec_code_builder_t *builder, ec_code_intel_reg_t base, + ec_code_intel_reg_t index, uint32_t scale, + int32_t offset, uint32_t dst); + +void +ec_code_intel_op_mov_avx2avx(ec_code_builder_t *builder, uint32_t src, + uint32_t dst); +void +ec_code_intel_op_mov_avx2m(ec_code_builder_t *builder, uint32_t src, + ec_code_intel_reg_t base, ec_code_intel_reg_t index, + uint32_t scale, int32_t offset); +void +ec_code_intel_op_mov_m2avx(ec_code_builder_t *builder, ec_code_intel_reg_t base, + ec_code_intel_reg_t index, uint32_t scale, + int32_t offset, uint32_t dst); +void +ec_code_intel_op_xor_avx2avx(ec_code_builder_t *builder, uint32_t src, + uint32_t dst); +void +ec_code_intel_op_xor_m2avx(ec_code_builder_t *builder, ec_code_intel_reg_t base, + ec_code_intel_reg_t index, uint32_t scale, + int32_t offset, uint32_t dst); + +#endif /* __EC_CODE_INTEL_H__ */ diff --git a/xlators/cluster/ec/src/ec-code-sse.c b/xlators/cluster/ec/src/ec-code-sse.c new file mode 100644 index 00000000000..e11e7ff8400 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-sse.c @@ -0,0 +1,101 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <errno.h> + +#include "ec-code-intel.h" + +static void +ec_code_sse_prolog(ec_code_builder_t *builder) +{ + builder->loop = builder->address; +} + +static void +ec_code_sse_epilog(ec_code_builder_t *builder) +{ + ec_code_intel_op_add_i2r(builder, 16, REG_DX); + ec_code_intel_op_add_i2r(builder, 16, REG_DI); + ec_code_intel_op_test_i2r(builder, builder->width - 1, REG_DX); + ec_code_intel_op_jne(builder, builder->loop); + + ec_code_intel_op_ret(builder, 0); +} + +static void +ec_code_sse_load(ec_code_builder_t *builder, uint32_t dst, uint32_t idx, + uint32_t bit) +{ + if (builder->linear) { + ec_code_intel_op_mov_m2sse( + builder, REG_SI, REG_DX, 1, + idx * builder->width * builder->bits + bit * builder->width, dst); + } else { + if (builder->base != idx) { + ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, + REG_AX); + builder->base = idx; + } + ec_code_intel_op_mov_m2sse(builder, REG_AX, REG_DX, 1, + bit * builder->width, dst); + } +} + +static void +ec_code_sse_store(ec_code_builder_t *builder, uint32_t src, uint32_t bit) +{ + ec_code_intel_op_mov_sse2m(builder, src, REG_DI, REG_NULL, 0, + bit * builder->width); +} + +static void +ec_code_sse_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ + ec_code_intel_op_mov_sse2sse(builder, src, dst); +} + +static void +ec_code_sse_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ + ec_code_intel_op_xor_sse2sse(builder, src, dst); +} + +static void +ec_code_sse_xorm(ec_code_builder_t *builder, uint32_t dst, uint32_t idx, + uint32_t bit) +{ + if (builder->linear) { + ec_code_intel_op_xor_m2sse( + builder, REG_SI, REG_DX, 1, + idx * builder->width * builder->bits + bit * builder->width, dst); + } else { + if 
(builder->base != idx) { + ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, + REG_AX); + builder->base = idx; + } + ec_code_intel_op_xor_m2sse(builder, REG_AX, REG_DX, 1, + bit * builder->width, dst); + } +} + +static char *ec_code_sse_needed_flags[] = {"sse2", NULL}; + +ec_code_gen_t ec_code_gen_sse = {.name = "sse", + .flags = ec_code_sse_needed_flags, + .width = 16, + .prolog = ec_code_sse_prolog, + .epilog = ec_code_sse_epilog, + .load = ec_code_sse_load, + .store = ec_code_sse_store, + .copy = ec_code_sse_copy, + .xor2 = ec_code_sse_xor2, + .xor3 = NULL, + .xorm = ec_code_sse_xorm}; diff --git a/xlators/cluster/ec/src/ec-code-sse.h b/xlators/cluster/ec/src/ec-code-sse.h new file mode 100644 index 00000000000..f1acbcf894b --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-sse.h @@ -0,0 +1,18 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_CODE_SSE_H__ +#define __EC_CODE_SSE_H__ + +#include "ec-code.h" + +extern ec_code_gen_t ec_code_gen_sse; + +#endif /* __EC_CODE_SSE_H__ */ diff --git a/xlators/cluster/ec/src/ec-code-x64.c b/xlators/cluster/ec/src/ec-code-x64.c new file mode 100644 index 00000000000..26565b4493f --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-x64.c @@ -0,0 +1,144 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/
+
+#include <errno.h>
+
+#include "ec-code-intel.h"
+
+/* Maps the logical register numbers used by the generic code to real
+ * general purpose registers. */
+static ec_code_intel_reg_t ec_code_x64_regmap[] = {
+    REG_AX, REG_CX, REG_BP, REG_8,  REG_9,  REG_10,
+    REG_11, REG_12, REG_13, REG_14, REG_15};
+
+/* Number of logical registers available (entries in the map above). */
+#define EC_CODE_X64_REGS                                                       \
+    ((uint32_t)(sizeof(ec_code_x64_regmap) / sizeof(ec_code_x64_regmap[0])))
+
+/* First map index that prolog/epilog save and restore on demand (REG_12
+ * and above). REG_BP is always saved, and REG_BX only in non-linear
+ * mode. */
+#define EC_CODE_X64_FIRST_SAVED 7
+
+/* Emits the function entry: saves REG_BP, REG_BX (non-linear mode only)
+ * and every used register from EC_CODE_X64_FIRST_SAVED on, then records
+ * the loop start address. Reports EINVAL if more registers are requested
+ * than the map provides. */
+static void
+ec_code_x64_prolog(ec_code_builder_t *builder)
+{
+    uint32_t reg;
+
+    ec_code_intel_op_push_r(builder, REG_BP);
+    if (!builder->linear) {
+        ec_code_intel_op_push_r(builder, REG_BX);
+    }
+    if (builder->regs > EC_CODE_X64_REGS) {
+        ec_code_error(builder, EINVAL);
+        return;
+    }
+    for (reg = EC_CODE_X64_FIRST_SAVED; reg < builder->regs; reg++) {
+        ec_code_intel_op_push_r(builder, ec_code_x64_regmap[reg]);
+    }
+
+    builder->loop = builder->address;
+}
+
+/* Emits the loop tail (advance REG_DX and REG_DI by 8, test REG_DX against
+ * width - 1, jne back to the loop start) followed by the restore of the
+ * registers saved by the prolog, in reverse order, and the return. */
+static void
+ec_code_x64_epilog(ec_code_builder_t *builder)
+{
+    uint32_t reg;
+
+    ec_code_intel_op_add_i2r(builder, 8, REG_DX);
+    ec_code_intel_op_add_i2r(builder, 8, REG_DI);
+    ec_code_intel_op_test_i2r(builder, builder->width - 1, REG_DX);
+    ec_code_intel_op_jne(builder, builder->loop);
+
+    if (builder->regs > EC_CODE_X64_REGS) {
+        ec_code_error(builder, EINVAL);
+        return;
+    }
+    reg = builder->regs;
+    while (reg > EC_CODE_X64_FIRST_SAVED) {
+        reg--;
+        ec_code_intel_op_pop_r(builder, ec_code_x64_regmap[reg]);
+    }
+    if (!builder->linear) {
+        ec_code_intel_op_pop_r(builder, REG_BX);
+    }
+    ec_code_intel_op_pop_r(builder, REG_BP);
+    ec_code_intel_op_ret(builder, 0);
+}
+
+/* Used in non-linear mode only. Emits a load of the 8-byte slot at
+ * REG_SI + idx * 8 into REG_BX (presumably a per-source buffer pointer;
+ * confirm against the caller) unless builder->base records that REG_BX was
+ * already loaded for the same idx. */
+static void
+ec_code_x64_select_base(ec_code_builder_t *builder, uint32_t idx)
+{
+    if (builder->base == idx) {
+        return;
+    }
+
+    ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, REG_BX);
+    builder->base = idx;
+}
+
+/* Emits the load of bit-plane 'bit' of source 'idx' into logical register
+ * 'dst'. */
+static void
+ec_code_x64_load(ec_code_builder_t *builder, uint32_t dst, uint32_t idx,
+                 uint32_t bit)
+{
+    ec_code_intel_reg_t target = ec_code_x64_regmap[dst];
+
+    if (builder->linear) {
+        ec_code_intel_op_mov_m2r(
+            builder, REG_SI, REG_DX, 1,
+            idx * builder->width * builder->bits + bit * builder->width,
+            target);
+        return;
+    }
+
+    ec_code_x64_select_base(builder, idx);
+    ec_code_intel_op_mov_m2r(builder, REG_BX, REG_DX, 1, bit * builder->width,
+                             target);
+}
+
+/* Emits the store of logical register 'src' to REG_DI + bit * width. */
+static void
+ec_code_x64_store(ec_code_builder_t *builder, uint32_t src, uint32_t bit)
+{
+    ec_code_intel_reg_t source = ec_code_x64_regmap[src];
+
+    ec_code_intel_op_mov_r2m(builder, source, REG_DI, REG_NULL, 0,
+                             bit * builder->width);
+}
+
+/* Emits a register to register move (dst <- src). */
+static void
+ec_code_x64_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+    ec_code_intel_reg_t target = ec_code_x64_regmap[dst];
+    ec_code_intel_reg_t source = ec_code_x64_regmap[src];
+
+    ec_code_intel_op_mov_r2r(builder, source, target);
+}
+
+/* Emits a register to register xor (dst <- dst ^ src). */
+static void
+ec_code_x64_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+    ec_code_intel_reg_t target = ec_code_x64_regmap[dst];
+    ec_code_intel_reg_t source = ec_code_x64_regmap[src];
+
+    ec_code_intel_op_xor_r2r(builder, source, target);
+}
+
+/* Same addressing as ec_code_x64_load(), but the memory operand is xor'ed
+ * into 'dst' instead of replacing it. */
+static void
+ec_code_x64_xorm(ec_code_builder_t *builder, uint32_t dst, uint32_t idx,
+                 uint32_t bit)
+{
+    ec_code_intel_reg_t target = ec_code_x64_regmap[dst];
+
+    if (builder->linear) {
+        ec_code_intel_op_xor_m2r(
+            builder, REG_SI, REG_DX, 1,
+            idx * builder->width * builder->bits + bit * builder->width,
+            target);
+        return;
+    }
+
+    ec_code_x64_select_base(builder, idx);
+    ec_code_intel_op_xor_m2r(builder, REG_BX, REG_DX, 1, bit * builder->width,
+                             target);
+}
+
+/* No special CPU flags are required by this generator. */
+static char *ec_code_x64_needed_flags[] = {NULL};
+
+/* Plain 64-bit code generator. There is no three operand xor, so xor3 is
+ * NULL. */
+ec_code_gen_t ec_code_gen_x64 = {.name = "x64",
+                                 .flags = ec_code_x64_needed_flags,
+                                 .width = sizeof(uint64_t),
+                                 .prolog = ec_code_x64_prolog,
+                                 .epilog = ec_code_x64_epilog,
+                                 .load = ec_code_x64_load,
+                                 .store = ec_code_x64_store,
+                                 .copy = ec_code_x64_copy,
+                                 .xor2 = ec_code_x64_xor2,
+                                 .xor3 = NULL,
+                                 .xorm = ec_code_x64_xorm};
diff --git a/xlators/cluster/ec/src/ec-code-x64.h b/xlators/cluster/ec/src/ec-code-x64.h
new file mode 100644
index 00000000000..bd8174e4bf5
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-x64.h
@@ -0,0 +1,18 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/ + +#ifndef __EC_CODE_X64_H__ +#define __EC_CODE_X64_H__ + +#include "ec-code.h" + +extern ec_code_gen_t ec_code_gen_x64; + +#endif /* __EC_CODE_X64_H__ */ diff --git a/xlators/cluster/ec/src/ec-code.c b/xlators/cluster/ec/src/ec-code.c new file mode 100644 index 00000000000..03162ae05a9 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code.c @@ -0,0 +1,1060 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <string.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <ctype.h> + +#include <glusterfs/syscall.h> + +#include "ec-mem-types.h" +#include "ec-code.h" +#include "ec-messages.h" +#include "ec-code-c.h" +#include "ec-helpers.h" + +#ifdef USE_EC_DYNAMIC_X64 +#include "ec-code-x64.h" +#endif + +#ifdef USE_EC_DYNAMIC_SSE +#include "ec-code-sse.h" +#endif + +#ifdef USE_EC_DYNAMIC_AVX +#include "ec-code-avx.h" +#endif + +#define EC_CODE_SIZE (1024 * 64) +#define EC_CODE_ALIGN 4096 + +#define EC_CODE_CHUNK_MIN_SIZE 512 + +#define EC_PROC_BUFFER_SIZE 4096 + +#define PROC_CPUINFO "/proc/cpuinfo" + +struct _ec_code_proc; +typedef struct _ec_code_proc ec_code_proc_t; + +struct _ec_code_proc { + int32_t fd; + gf_boolean_t eof; + gf_boolean_t error; + gf_boolean_t skip; + ssize_t size; + ssize_t pos; + char buffer[EC_PROC_BUFFER_SIZE]; +}; + +static ec_code_gen_t *ec_code_gen_table[] = { +#ifdef USE_EC_DYNAMIC_AVX + &ec_code_gen_avx, +#endif +#ifdef USE_EC_DYNAMIC_SSE + &ec_code_gen_sse, +#endif +#ifdef USE_EC_DYNAMIC_X64 + &ec_code_gen_x64, +#endif + NULL}; + +static void +ec_code_arg_set(ec_code_arg_t *arg, uint32_t value) +{ + arg->value = value; +} + +static void +ec_code_arg_assign(ec_code_builder_t *builder, 
ec_code_op_t *op, + ec_code_arg_t *arg, uint32_t reg) +{ + arg->value = reg; + + if (builder->regs <= reg) { + builder->regs = reg + 1; + } +} + +static void +ec_code_arg_use(ec_code_builder_t *builder, ec_code_op_t *op, + ec_code_arg_t *arg, uint32_t reg) +{ + arg->value = reg; +} + +static void +ec_code_arg_update(ec_code_builder_t *builder, ec_code_op_t *op, + ec_code_arg_t *arg, uint32_t reg) +{ + arg->value = reg; +} + +static ec_code_op_t * +ec_code_op_next(ec_code_builder_t *builder) +{ + ec_code_op_t *op; + + op = &builder->ops[builder->count++]; + memset(op, 0, sizeof(ec_code_op_t)); + + return op; +} + +static void +ec_code_load(ec_code_builder_t *builder, uint32_t bit, uint32_t offset) +{ + ec_code_op_t *op; + + op = ec_code_op_next(builder); + + op->op = EC_GF_OP_LOAD; + ec_code_arg_assign(builder, op, &op->arg1, builder->map[bit]); + ec_code_arg_set(&op->arg2, offset); + ec_code_arg_set(&op->arg3, bit); +} + +static void +ec_code_store(ec_code_builder_t *builder, uint32_t reg, uint32_t bit) +{ + ec_code_op_t *op; + + op = ec_code_op_next(builder); + + op->op = EC_GF_OP_STORE; + ec_code_arg_use(builder, op, &op->arg1, builder->map[reg]); + ec_code_arg_set(&op->arg2, 0); + ec_code_arg_set(&op->arg3, bit); +} + +static void +ec_code_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ + ec_code_op_t *op; + + op = ec_code_op_next(builder); + + op->op = EC_GF_OP_COPY; + ec_code_arg_assign(builder, op, &op->arg1, builder->map[dst]); + ec_code_arg_use(builder, op, &op->arg2, builder->map[src]); + ec_code_arg_set(&op->arg3, 0); +} + +static void +ec_code_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ + ec_code_op_t *op; + + op = ec_code_op_next(builder); + + op->op = EC_GF_OP_XOR2; + ec_code_arg_update(builder, op, &op->arg1, builder->map[dst]); + ec_code_arg_use(builder, op, &op->arg2, builder->map[src]); + ec_code_arg_set(&op->arg3, 0); +} + +static void +ec_code_xor3(ec_code_builder_t *builder, uint32_t dst, uint32_t src1, + 
uint32_t src2) +{ + ec_code_op_t *op; + + if (builder->code->gen->xor3 == NULL) { + ec_code_copy(builder, dst, src1); + ec_code_xor2(builder, dst, src2); + + return; + } + + op = ec_code_op_next(builder); + + op->op = EC_GF_OP_XOR3; + ec_code_arg_assign(builder, op, &op->arg1, builder->map[dst]); + ec_code_arg_use(builder, op, &op->arg2, builder->map[src1]); + ec_code_arg_use(builder, op, &op->arg3, builder->map[src2]); +} + +static void +ec_code_xorm(ec_code_builder_t *builder, uint32_t bit, uint32_t offset) +{ + ec_code_op_t *op; + + op = ec_code_op_next(builder); + + op->op = EC_GF_OP_XORM; + ec_code_arg_update(builder, op, &op->arg1, builder->map[bit]); + ec_code_arg_set(&op->arg2, offset); + ec_code_arg_set(&op->arg3, bit); +} + +static void +ec_code_dup(ec_code_builder_t *builder, ec_gf_op_t *op) +{ + switch (op->op) { + case EC_GF_OP_COPY: + ec_code_copy(builder, op->arg1, op->arg2); + break; + case EC_GF_OP_XOR2: + ec_code_xor2(builder, op->arg1, op->arg2); + break; + case EC_GF_OP_XOR3: + ec_code_xor3(builder, op->arg1, op->arg2, op->arg3); + break; + default: + break; + } +} + +static void +ec_code_gf_load(ec_code_builder_t *builder, uint32_t offset) +{ + uint32_t i; + + for (i = 0; i < builder->code->gf->bits; i++) { + ec_code_load(builder, i, offset); + } +} + +static void +ec_code_gf_load_xor(ec_code_builder_t *builder, uint32_t offset) +{ + uint32_t i; + + for (i = 0; i < builder->code->gf->bits; i++) { + ec_code_xorm(builder, i, offset); + } +} + +static void +ec_code_gf_store(ec_code_builder_t *builder) +{ + uint32_t i; + + for (i = 0; i < builder->code->gf->bits; i++) { + ec_code_store(builder, i, i); + } +} + +static void +ec_code_gf_clear(ec_code_builder_t *builder) +{ + uint32_t i; + + ec_code_xor2(builder, 0, 0); + for (i = 0; i < builder->code->gf->bits; i++) { + ec_code_store(builder, 0, i); + } +} + +static void +ec_code_gf_mul(ec_code_builder_t *builder, uint32_t value) +{ + ec_gf_mul_t *mul; + ec_gf_op_t *op; + uint32_t 
map[EC_GF_MAX_REGS]; + int32_t i; + + mul = builder->code->gf->table[value]; + for (op = mul->ops; op->op != EC_GF_OP_END; op++) { + ec_code_dup(builder, op); + } + + for (i = 0; i < mul->regs; i++) { + map[i] = builder->map[mul->map[i]]; + } + memcpy(builder->map, map, sizeof(uint32_t) * mul->regs); +} + +static ec_code_builder_t * +ec_code_prepare(ec_code_t *code, uint32_t count, uint32_t width, + gf_boolean_t linear) +{ + ec_code_builder_t *builder; + uint32_t i; + + count *= code->gf->bits + code->gf->max_ops; + count += code->gf->bits; + builder = GF_MALLOC( + sizeof(ec_code_builder_t) + sizeof(ec_code_op_t) * count, + ec_mt_ec_code_builder_t); + if (builder == NULL) { + return EC_ERR(ENOMEM); + } + + builder->address = 0; + builder->code = code; + builder->size = 0; + builder->count = 0; + builder->regs = 0; + builder->error = 0; + builder->bits = code->gf->bits; + builder->width = width; + builder->data = NULL; + builder->linear = linear; + builder->base = -1; + + for (i = 0; i < EC_GF_MAX_REGS; i++) { + builder->map[i] = i; + } + + return builder; +} + +static size_t +ec_code_space_size(void) +{ + return (sizeof(ec_code_space_t) + 15) & ~15; +} + +static size_t +ec_code_chunk_size(void) +{ + return (sizeof(ec_code_chunk_t) + 15) & ~15; +} + +static ec_code_chunk_t * +ec_code_chunk_from_space(ec_code_space_t *space) +{ + return (ec_code_chunk_t *)((uintptr_t)space + ec_code_space_size()); +} + +static void * +ec_code_to_executable(ec_code_space_t *space, void *addr) +{ + return (void *)((uintptr_t)addr - (uintptr_t)space + + (uintptr_t)space->exec); +} + +static void * +ec_code_from_executable(ec_code_space_t *space, void *addr) +{ + return (void *)((uintptr_t)addr - (uintptr_t)space->exec + + (uintptr_t)space); +} + +static void * +ec_code_func_from_chunk(ec_code_chunk_t *chunk, void **exec) +{ + void *addr; + + addr = (void *)((uintptr_t)chunk + ec_code_chunk_size()); + + *exec = ec_code_to_executable(chunk->space, addr); + + return addr; +} + +static 
ec_code_chunk_t * +ec_code_chunk_from_func(ec_code_func_linear_t func) +{ + ec_code_chunk_t *chunk; + + chunk = (ec_code_chunk_t *)((uintptr_t)func - ec_code_chunk_size()); + + return ec_code_from_executable(chunk->space, chunk); +} + +static ec_code_chunk_t * +ec_code_chunk_split(ec_code_chunk_t *chunk, size_t size) +{ + ec_code_chunk_t *extra; + ssize_t avail; + + avail = chunk->size - size - ec_code_chunk_size(); + if (avail > 0) { + extra = (ec_code_chunk_t *)((uintptr_t)chunk + chunk->size - avail); + extra->space = chunk->space; + extra->size = avail; + list_add(&extra->list, &chunk->list); + chunk->size = size; + } + list_del_init(&chunk->list); + + return chunk; +} + +static gf_boolean_t +ec_code_chunk_touch(ec_code_chunk_t *prev, ec_code_chunk_t *next) +{ + uintptr_t end; + + end = (uintptr_t)prev + ec_code_chunk_size() + prev->size; + return (end == (uintptr_t)next); +} + +static ec_code_space_t * +ec_code_space_create(ec_code_t *code, size_t size) +{ + char path[] = GLUSTERFS_LIBEXECDIR "/ec-code-dynamic.XXXXXX"; + ec_code_space_t *space; + void *exec; + int32_t fd, err; + + /* We need to create memory areas to store the generated dynamic code. + * Obviously these areas need to be written to be able to create the + * code and they also need to be executable to execute it. + * + * However it's a bad practice to have a memory region that is both + * writable *and* executable. In fact, selinux forbids this and causes + * attempts to do so to fail (unless specifically configured). + * + * To solve the problem we'll use two distinct memory areas mapped to + * the same physical storage. One of the memory areas will have write + * permission, and the other will have execute permission. Both areas + * will have the same contents. The physical storage will be a regular + * file that will be mmapped to both areas. + */ + + /* We need to create a temporary file as the backend storage for the + * memory mapped areas. 
*/ + /* coverity[secure_temp] mkstemp uses 0600 as the mode and is safe */ + fd = mkstemp(path); + if (fd < 0) { + err = errno; + gf_msg(THIS->name, GF_LOG_ERROR, err, EC_MSG_DYN_CREATE_FAILED, + "Unable to create a temporary file for the ec dynamic " + "code"); + space = EC_ERR(err); + goto done; + } + /* Once created we don't need to keep it in the file system. It will + * still exist until we close the last file descriptor or unmap the + * memory areas bound to the file. */ + sys_unlink(path); + + size = (size + EC_CODE_ALIGN - 1) & ~(EC_CODE_ALIGN - 1); + if (sys_ftruncate(fd, size) < 0) { + err = errno; + gf_msg(THIS->name, GF_LOG_ERROR, err, EC_MSG_DYN_CREATE_FAILED, + "Unable to resize the file for the ec dynamic code"); + space = EC_ERR(err); + goto done_close; + } + + /* This creates an executable memory area to be able to run the + * generated fragments of code. */ + exec = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_SHARED, fd, 0); + if (exec == MAP_FAILED) { + err = errno; + gf_msg(THIS->name, GF_LOG_ERROR, err, EC_MSG_DYN_CREATE_FAILED, + "Unable to map the executable area for the ec dynamic " + "code"); + space = EC_ERR(err); + goto done_close; + } + /* It's not important to check the return value of mlock(). If it fails + * everything will continue to work normally. */ + mlock(exec, size); + + /* This maps a read/write memory area to be able to create the dynamici + * code. */ + space = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (space == MAP_FAILED) { + err = errno; + gf_msg(THIS->name, GF_LOG_ERROR, err, EC_MSG_DYN_CREATE_FAILED, + "Unable to map the writable area for the ec dynamic " + "code"); + space = EC_ERR(err); + + munmap(exec, size); + + goto done_close; + } + + space->exec = exec; + space->size = size; + space->code = code; + list_add_tail(&space->list, &code->spaces); + INIT_LIST_HEAD(&space->chunks); + +done_close: + /* If everything has succeeded, we already have the memory areas + * mapped. 
We don't need the file descriptor anymore because the + * backend storage will be there until the mmap()'d regions are + * unmapped. */ + sys_close(fd); +done: + return space; +} + +static void +ec_code_space_destroy(ec_code_space_t *space) +{ + list_del_init(&space->list); + + munmap(space->exec, space->size); + munmap(space, space->size); +} + +static void +ec_code_chunk_merge(ec_code_chunk_t *chunk) +{ + ec_code_chunk_t *item, *tmp; + + list_for_each_entry_safe(item, tmp, &chunk->space->chunks, list) + { + if ((uintptr_t)item > (uintptr_t)chunk) { + list_add_tail(&chunk->list, &item->list); + if (ec_code_chunk_touch(chunk, item)) { + chunk->size += item->size + ec_code_chunk_size(); + list_del_init(&item->list); + } + + goto check; + } + if (ec_code_chunk_touch(item, chunk)) { + item->size += chunk->size + ec_code_chunk_size(); + list_del_init(&item->list); + chunk = item; + } + } + list_add_tail(&chunk->list, &chunk->space->chunks); + +check: + if (chunk->size == + chunk->space->size - ec_code_space_size() - ec_code_chunk_size()) { + ec_code_space_destroy(chunk->space); + } +} + +static ec_code_chunk_t * +ec_code_space_alloc(ec_code_t *code, size_t size) +{ + ec_code_space_t *space; + ec_code_chunk_t *chunk; + size_t map_size; + + /* To minimize fragmentation, we only allocate chunks of sizes multiples + * of EC_CODE_CHUNK_MIN_SIZE. 
*/ + size = ((size + ec_code_chunk_size() + EC_CODE_CHUNK_MIN_SIZE - 1) & + ~(EC_CODE_CHUNK_MIN_SIZE - 1)) - + ec_code_chunk_size(); + list_for_each_entry(space, &code->spaces, list) + { + list_for_each_entry(chunk, &space->chunks, list) + { + if (chunk->size >= size) { + goto out; + } + } + } + + map_size = EC_CODE_SIZE - ec_code_space_size() - ec_code_chunk_size(); + if (map_size < size) { + map_size = size; + } + space = ec_code_space_create(code, map_size); + if (EC_IS_ERR(space)) { + return (ec_code_chunk_t *)space; + } + + chunk = ec_code_chunk_from_space(space); + chunk->size = map_size - ec_code_space_size() - ec_code_chunk_size(); + list_add(&chunk->list, &space->chunks); + +out: + chunk->space = space; + + return ec_code_chunk_split(chunk, size); +} + +static ec_code_chunk_t * +ec_code_alloc(ec_code_t *code, uint32_t size) +{ + ec_code_chunk_t *chunk; + + LOCK(&code->lock); + + chunk = ec_code_space_alloc(code, size); + + UNLOCK(&code->lock); + + return chunk; +} + +static void +ec_code_free(ec_code_chunk_t *chunk) +{ + gf_lock_t *lock; + + lock = &chunk->space->code->lock; + LOCK(lock); + + ec_code_chunk_merge(chunk); + + UNLOCK(lock); +} + +static int32_t +ec_code_write(ec_code_builder_t *builder) +{ + ec_code_gen_t *gen; + ec_code_op_t *op; + uint32_t i; + + builder->error = 0; + builder->size = 0; + builder->address = 0; + builder->base = -1; + + gen = builder->code->gen; + gen->prolog(builder); + for (i = 0; i < builder->count; i++) { + op = &builder->ops[i]; + switch (op->op) { + case EC_GF_OP_LOAD: + gen->load(builder, op->arg1.value, op->arg2.value, + op->arg3.value); + break; + case EC_GF_OP_STORE: + gen->store(builder, op->arg1.value, op->arg3.value); + break; + case EC_GF_OP_COPY: + gen->copy(builder, op->arg1.value, op->arg2.value); + break; + case EC_GF_OP_XOR2: + gen->xor2(builder, op->arg1.value, op->arg2.value); + break; + case EC_GF_OP_XOR3: + gen->xor3(builder, op->arg1.value, op->arg2.value, + op->arg3.value); + break; + case 
EC_GF_OP_XORM:
+                gen->xorm(builder, op->arg1.value, op->arg2.value,
+                          op->arg3.value);
+                break;
+            default:
+                break;
+        }
+    }
+    gen->epilog(builder);
+
+    return builder->error;
+}
+
+/* Turns the operations recorded in 'builder' into executable code.
+ *
+ * The code is generated twice: the first ec_code_write() runs with
+ * builder->data still unset, so it only computes builder->size; a chunk of
+ * that size is then allocated and the second ec_code_write() emits the
+ * bytes into it.
+ *
+ * Ownership of 'builder' is taken by this function: it is released on
+ * every path, not only on success, because the caller keeps no other
+ * reference to it.
+ *
+ * Returns the executable address of the generated function, or an
+ * EC_ERR() encoded error. */
+static void *
+ec_code_compile(ec_code_builder_t *builder)
+{
+    ec_code_chunk_t *chunk;
+    void *func;
+    int32_t err;
+
+    err = ec_code_write(builder);
+    if (err != 0) {
+        func = EC_ERR(err);
+        goto out;
+    }
+
+    chunk = ec_code_alloc(builder->code, builder->size);
+    if (EC_IS_ERR(chunk)) {
+        /* The chunk pointer already carries the encoded error. */
+        func = chunk;
+        goto out;
+    }
+    builder->data = ec_code_func_from_chunk(chunk, &func);
+
+    err = ec_code_write(builder);
+    if (err != 0) {
+        ec_code_free(chunk);
+        func = EC_ERR(err);
+    }
+
+out:
+    GF_FREE(builder);
+
+    return func;
+}
+
+/* Allocates and initializes a code context bound to Galois field 'gf' and
+ * code generator 'gen'. Returns EC_ERR(ENOMEM) if the allocation fails. */
+ec_code_t *
+ec_code_create(ec_gf_t *gf, ec_code_gen_t *gen)
+{
+    ec_code_t *code;
+
+    code = GF_MALLOC(sizeof(ec_code_t), ec_mt_ec_code_t);
+    if (code == NULL) {
+        return EC_ERR(ENOMEM);
+    }
+    memset(code, 0, sizeof(ec_code_t));
+    INIT_LIST_HEAD(&code->spaces);
+    LOCK_INIT(&code->lock);
+
+    code->gf = gf;
+    code->gen = gen;
+
+    return code;
+}
+
+/* Destroys the lock of 'code' and releases its memory. */
+void
+ec_code_destroy(ec_code_t *code)
+{
+    /* NOTE(review): this check has an empty body, so code spaces still
+     * linked in code->spaces are neither reported nor released here. */
+    if (!list_empty(&code->spaces)) {
+    }
+
+    LOCK_DESTROY(&code->lock);
+
+    GF_FREE(code);
+}
+
+/* Looks for the next non-zero entry of 'values' after index *offset.
+ *
+ * On return *offset is the index of that entry, or a value >= count if
+ * there is none. The entry itself is returned, or 0 if none was found. */
+static uint32_t
+ec_code_value_next(uint32_t *values, uint32_t count, uint32_t *offset)
+{
+    uint32_t i, next;
+
+    next = 0;
+    for (i = *offset + 1; i < count; i++) {
+        next = values[i];
+        if (next != 0) {
+            break;
+        }
+    }
+    *offset = i;
+
+    return next;
+}
+
+static void *
+ec_code_build_dynamic(ec_code_t *code, uint32_t width, uint32_t *values,
+                      uint32_t count, gf_boolean_t linear)
+{
+    ec_code_builder_t *builder;
+    uint32_t offset, val, next;
+
+    builder = ec_code_prepare(code, count, width, linear);
+    if (EC_IS_ERR(builder)) {
+        return builder;
+    }
+
+    /* Start one position before the first entry: ec_code_value_next()
+     * begins scanning at offset + 1. */
+    offset = -1;
+    next = ec_code_value_next(values, count, &offset);
+    if (next != 0) {
+        ec_code_gf_load(builder, offset);
+        do {
+            val = next;
+            next = ec_code_value_next(values, count, &offset);
+            if (next != 0) {
+                ec_code_gf_mul(builder, ec_gf_div(code->gf,
val, next)); + ec_code_gf_load_xor(builder, offset); + } + } while (next != 0); + ec_code_gf_mul(builder, val); + ec_code_gf_store(builder); + } else { + ec_code_gf_clear(builder); + } + + return ec_code_compile(builder); +} + +static void * +ec_code_build(ec_code_t *code, uint32_t width, uint32_t *values, uint32_t count, + gf_boolean_t linear) +{ + void *func; + + if (code->gen != NULL) { + func = ec_code_build_dynamic(code, width, values, count, linear); + if (!EC_IS_ERR(func)) { + return func; + } + + gf_msg_debug(THIS->name, GF_LOG_DEBUG, + "Unable to generate dynamic code. Falling back " + "to precompiled code"); + + /* The dynamic code generation shouldn't fail in normal + * conditions, but if it fails at some point, it's very + * probable that it will fail again, so we completely disable + * dynamic code generation. */ + code->gen = NULL; + } + + ec_code_c_prepare(code->gf, values, count); + + if (linear) { + return ec_code_c_linear; + } + + return ec_code_c_interleaved; +} + +ec_code_func_linear_t +ec_code_build_linear(ec_code_t *code, uint32_t width, uint32_t *values, + uint32_t count) +{ + return (ec_code_func_linear_t)ec_code_build(code, width, values, count, + _gf_true); +} + +ec_code_func_interleaved_t +ec_code_build_interleaved(ec_code_t *code, uint32_t width, uint32_t *values, + uint32_t count) +{ + return (ec_code_func_interleaved_t)ec_code_build(code, width, values, count, + _gf_false); +} + +void +ec_code_release(ec_code_t *code, ec_code_func_t *func) +{ + if ((func->linear != ec_code_c_linear) && + (func->interleaved != ec_code_c_interleaved)) { + ec_code_free(ec_code_chunk_from_func(func->linear)); + } +} + +void +ec_code_error(ec_code_builder_t *builder, int32_t error) +{ + if (builder->error == 0) { + gf_msg(THIS->name, GF_LOG_ERROR, error, EC_MSG_DYN_CODEGEN_FAILED, + "Failed to generate dynamic code"); + builder->error = error; + } +} + +void +ec_code_emit(ec_code_builder_t *builder, uint8_t *bytes, uint32_t count) +{ + if (builder->error != 
0) { + return; + } + + if (builder->data != NULL) { + memcpy(builder->data + builder->size, bytes, count); + } + + builder->size += count; + builder->address += count; +} + +static char * +ec_code_proc_trim_left(char *text, ssize_t *length) +{ + ssize_t len; + + for (len = *length; (len > 0) && isspace(*text); len--) { + text++; + } + *length = len; + + return text; +} + +static char * +ec_code_proc_trim_right(char *text, ssize_t *length, char sep) +{ + char *last; + ssize_t len; + + len = *length; + + last = text; + for (len = *length; (len > 0) && (*text != sep); len--) { + if (!isspace(*text)) { + last = text + 1; + } + text++; + } + *last = 0; + *length = len; + + return text; +} + +static char * +ec_code_proc_line_parse(ec_code_proc_t *file, ssize_t *length) +{ + char *text, *end; + ssize_t len; + + len = file->size - file->pos; + text = ec_code_proc_trim_left(file->buffer + file->pos, &len); + end = ec_code_proc_trim_right(text, &len, '\n'); + if (len == 0) { + if (!file->eof) { + if (text == file->buffer) { + file->size = file->pos = 0; + file->skip = _gf_true; + } else { + file->size = file->pos = end - text; + memmove(file->buffer, text, file->pos + 1); + } + len = sys_read(file->fd, file->buffer + file->pos, + sizeof(file->buffer) - file->pos - 1); + if (len > 0) { + file->size += len; + } + file->error = len < 0; + file->eof = len <= 0; + + return NULL; + } + file->size = file->pos = 0; + } else { + file->pos = end - file->buffer + 1; + } + + *length = end - text; + + if (file->skip) { + file->skip = _gf_false; + text = NULL; + } + + return text; +} + +static char * +ec_code_proc_line(ec_code_proc_t *file, ssize_t *length) +{ + char *text; + + text = NULL; + while (!file->eof) { + text = ec_code_proc_line_parse(file, length); + if (text != NULL) { + break; + } + } + + return text; +} + +static char * +ec_code_proc_split(char *text, ssize_t *length, char sep) +{ + text = ec_code_proc_trim_right(text, length, sep); + if (*length == 0) { + return NULL; + } 
+    (*length)--;
+    text++;
+
+    return ec_code_proc_trim_left(text, length);
+}
+
+/* Finds the first code generator, starting at index 'idx' of
+ * ec_code_gen_table, whose required flags are all reported by the CPU.
+ *
+ * 'list' holds 'count' NUL-terminated flag names stored back to back.
+ *
+ * Returns the index of the selected generator, or the index of the NULL
+ * terminator of the table if no generator is suitable. */
+static uint32_t
+ec_code_cpu_check(uint32_t idx, char *list, uint32_t count)
+{
+    ec_code_gen_t *gen;
+    char **ptr;
+    char *table[count + 1];
+    uint32_t i;
+
+    /* Build an index of the individual flag names. */
+    for (i = 0; i < count; i++) {
+        table[i] = list;
+        list += strlen(list) + 1;
+    }
+
+    gen = ec_code_gen_table[idx];
+    while (gen != NULL) {
+        for (ptr = gen->flags; *ptr != NULL; ptr++) {
+            for (i = 0; i < count; i++) {
+                if (strcmp(*ptr, table[i]) == 0) {
+                    break;
+                }
+            }
+            if (i >= count) {
+                /* A required flag is missing: try the next generator. */
+                gen = ec_code_gen_table[++idx];
+                break;
+            }
+        }
+        if (*ptr == NULL) {
+            /* All required flags were found. */
+            break;
+        }
+    }
+
+    return idx;
+}
+
+/* Selects the code generator to use based on the requested extension
+ * 'def' and the flags listed in /proc/cpuinfo.
+ *
+ * 'def' can be "none" (no dynamic code), "auto" (first generator of
+ * ec_code_gen_table supported by the CPU) or the name of a generator. If
+ * the named generator is not supported, a warning is logged and a later
+ * generator of the table is tried.
+ *
+ * Returns the selected generator or NULL if none is going to be used
+ * ("none" requested, unknown name, /proc/cpuinfo not readable or no
+ * supported generator). */
+ec_code_gen_t *
+ec_code_detect(xlator_t *xl, const char *def)
+{
+    ec_code_proc_t file;
+    ec_code_gen_t *gen = NULL;
+    char *line, *data, *list;
+    ssize_t length;
+    uint32_t count, base, select;
+
+    if (strcmp(def, "none") == 0) {
+        gf_msg(xl->name, GF_LOG_INFO, 0, EC_MSG_EXTENSION_NONE,
+               "Not using any cpu extensions");
+
+        return NULL;
+    }
+
+    file.fd = sys_open(PROC_CPUINFO, O_RDONLY, 0);
+    if (file.fd < 0) {
+        goto out;
+    }
+    file.size = file.pos = 0;
+    file.eof = file.error = file.skip = _gf_false;
+
+    select = 0;
+    if (strcmp(def, "auto") != 0) {
+        while (ec_code_gen_table[select] != NULL) {
+            if (strcmp(ec_code_gen_table[select]->name, def) == 0) {
+                break;
+            }
+            select++;
+        }
+        if (ec_code_gen_table[select] == NULL) {
+            gf_msg(xl->name, GF_LOG_WARNING, EINVAL, EC_MSG_EXTENSION_UNKNOWN,
+                   "CPU extension '%s' is not known. Not using any cpu "
+                   "extensions",
+                   def);
+
+            /* 'gen' is still NULL here. The file is already open, so it
+             * must be closed before returning. */
+            goto out_close;
+        }
+    } else {
+        def = NULL;
+    }
+
+    while ((line = ec_code_proc_line(&file, &length)) != NULL) {
+        data = ec_code_proc_split(line, &length, ':');
+        if ((data != NULL) && (strcmp(line, "flags") == 0)) {
+            /* Split the list of flags in place and count them. */
+            list = data;
+            count = 0;
+            while ((data != NULL) && (*data != 0)) {
+                count++;
+                data = ec_code_proc_split(data, &length, ' ');
+            }
+            base = select;
+            select = ec_code_cpu_check(select, list, count);
+            if ((base != select) && (def != NULL)) {
+                gf_msg(xl->name, GF_LOG_WARNING, ENOTSUP,
+                       EC_MSG_EXTENSION_UNSUPPORTED,
+                       "CPU extension '%s' is not supported", def);
+                /* Report the unsupported extension only once. */
+                def = NULL;
+            }
+        }
+    }
+
+    if (file.error) {
+        gf_msg(xl->name, GF_LOG_WARNING, 0, EC_MSG_EXTENSION_FAILED,
+               "Unable to determine supported CPU extensions. Not using any "
+               "cpu extensions");
+
+        gen = NULL;
+    } else {
+        gen = ec_code_gen_table[select];
+        if (gen == NULL) {
+            gf_msg(xl->name, GF_LOG_INFO, 0, EC_MSG_EXTENSION_NONE,
+                   "Not using any cpu extensions");
+        } else {
+            gf_msg(xl->name, GF_LOG_INFO, 0, EC_MSG_EXTENSION,
+                   "Using '%s' CPU extensions", gen->name);
+        }
+    }
+
+out_close:
+    sys_close(file.fd);
+
+out:
+    return gen;
+}
diff --git a/xlators/cluster/ec/src/ec-code.h b/xlators/cluster/ec/src/ec-code.h
new file mode 100644
index 00000000000..75fb35d93e3
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code.h
@@ -0,0 +1,44 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/ + +#ifndef __EC_CODE_H__ +#define __EC_CODE_H__ + +#include <glusterfs/xlator.h> +#include <glusterfs/list.h> + +#include "ec-types.h" +#include "ec-galois.h" + +ec_code_gen_t * +ec_code_detect(xlator_t *xl, const char *def); + +ec_code_t * +ec_code_create(ec_gf_t *gf, ec_code_gen_t *gen); + +void +ec_code_destroy(ec_code_t *code); + +ec_code_func_linear_t +ec_code_build_linear(ec_code_t *code, uint32_t width, uint32_t *values, + uint32_t count); +ec_code_func_interleaved_t +ec_code_build_interleaved(ec_code_t *code, uint32_t width, uint32_t *values, + uint32_t count); +void +ec_code_release(ec_code_t *code, ec_code_func_t *func); + +void +ec_code_error(ec_code_builder_t *builder, int32_t error); + +void +ec_code_emit(ec_code_builder_t *builder, uint8_t *bytes, uint32_t count); + +#endif /* __EC_CODE_H__ */ diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c new file mode 100644 index 00000000000..703a30e2485 --- /dev/null +++ b/xlators/cluster/ec/src/ec-combine.c @@ -0,0 +1,995 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <fnmatch.h> + +#include "libxlator.h" +#include <glusterfs/byte-order.h> + +#include "ec-types.h" +#include "ec-helpers.h" +#include "ec-common.h" +#include "ec-combine.h" +#include "ec-messages.h" +#include <glusterfs/quota-common-utils.h> + +#define EC_QUOTA_PREFIX "trusted.glusterfs.quota." 
+
+#define EC_MISSING_DATA ((data_t *)1ULL)
+
+struct _ec_dict_info;
+typedef struct _ec_dict_info ec_dict_info_t;
+
+struct _ec_dict_combine;
+typedef struct _ec_dict_combine ec_dict_combine_t;
+
+struct _ec_dict_info {
+    dict_t *dict;
+    int32_t count;
+};
+
+struct _ec_dict_combine {
+    ec_cbk_data_t *cbk;
+    int32_t which;
+};
+
+/* Combines the iatt structures of two answers of a write-like fop.
+ *
+ * The number of iatt entries to combine depends on the fop. The xattr
+ * modification fops have nothing to combine and always succeed.
+ *
+ * Returns 1 if the answers are compatible (src is merged into dst), 0 if
+ * any argument is NULL, the fop is not a supported one or the iatt
+ * structures do not match. */
+int32_t
+ec_combine_write(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+    int valid = 0;
+
+    if (!fop || !dst || !src) {
+        return 0;
+    }
+
+    switch (fop->id) {
+        case GF_FOP_REMOVEXATTR:
+        case GF_FOP_FREMOVEXATTR:
+        case GF_FOP_SETXATTR:
+        case GF_FOP_FSETXATTR:
+            return 1;
+
+        case GF_FOP_SYMLINK:
+        case GF_FOP_LINK:
+        case GF_FOP_CREATE:
+        case GF_FOP_MKNOD:
+        case GF_FOP_MKDIR:
+            valid = 3;
+            break;
+        case GF_FOP_UNLINK:
+        case GF_FOP_RMDIR:
+        case GF_FOP_SETATTR:
+        case GF_FOP_FSETATTR:
+        case GF_FOP_TRUNCATE:
+        case GF_FOP_FTRUNCATE:
+        case GF_FOP_WRITE:
+        case GF_FOP_FALLOCATE:
+        case GF_FOP_DISCARD:
+        case GF_FOP_ZEROFILL:
+            valid = 2;
+            break;
+        case GF_FOP_RENAME:
+            valid = 5;
+            break;
+        default:
+            gf_msg_callingfn(fop->xl->name, GF_LOG_WARNING, EINVAL,
+                             EC_MSG_INVALID_FOP, "Invalid fop %d", fop->id);
+            return 0;
+    }
+
+    if (!ec_iatt_combine(fop, dst->iatt, src->iatt, valid)) {
+        gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_IATT_MISMATCH,
+               "Mismatching iatt in "
+               "answers of '%s'",
+               gf_fop_list[fop->id]);
+        return 0;
+    }
+    return 1;
+}
+
+/* Keeps in (*dst_sec, *dst_nsec) the most recent of the destination and
+ * source timestamps. */
+void
+ec_iatt_time_merge(int64_t *dst_sec, uint32_t *dst_nsec, int64_t src_sec,
+                   uint32_t src_nsec)
+{
+    if ((*dst_sec < src_sec) ||
+        ((*dst_sec == src_sec) && (*dst_nsec < src_nsec))) {
+        *dst_sec = src_sec;
+        *dst_nsec = src_nsec;
+    }
+}
+
+/* Tells whether differences found in 'iatt' must be considered real
+ * problems: this is the case for lookups and for inodes locked by the top
+ * level fop. */
+static gf_boolean_t
+ec_iatt_is_trusted(ec_fop_data_t *fop, struct iatt *iatt)
+{
+    uint64_t ino;
+    int32_t i;
+
+    /* Only the top level fop will have fop->locks filled. */
+    while (fop->parent != NULL) {
+        fop = fop->parent;
+    }
+
+    /* Lookups are special requests always done without locks taken but they
+     * require to be able to identify differences between bricks. Special
+     * handling of these differences is already done in lookup specific code
+     * so we shouldn't ignore any difference here and consider all iatt
+     * structures as trusted. */
+    if (fop->id == GF_FOP_LOOKUP) {
+        return _gf_true;
+    }
+
+    /* Check if the iatt references an inode locked by the current fop */
+    for (i = 0; i < fop->lock_count; i++) {
+        ino = gfid_to_ino(fop->locks[i].lock->loc.inode->gfid);
+        if (iatt->ia_ino == ino) {
+            return _gf_true;
+        }
+    }
+
+    return _gf_false;
+}
+
+/* Checks that the 'count' iatt structures of 'src' are compatible with
+ * those of 'dst' and, if so, merges them into 'dst': blocks are added, and
+ * the largest block size and the most recent times are kept.
+ *
+ * Returns 1 on success, or 0 (after logging the differences) if any pair
+ * of structures cannot be combined. 'dst' is not modified on failure. */
+int32_t
+ec_iatt_combine(ec_fop_data_t *fop, struct iatt *dst, struct iatt *src,
+                int32_t count)
+{
+    int32_t i;
+    gf_boolean_t failed = _gf_false;
+
+    for (i = 0; i < count; i++) {
+        /* Check for basic fields. These fields must be equal always, even if
+         * the inode is not locked because in these cases the parent inode
+         * will be locked and differences in these fields require changes in
+         * the parent directory. */
+        if ((dst[i].ia_ino != src[i].ia_ino) ||
+            (((dst[i].ia_type == IA_IFBLK) || (dst[i].ia_type == IA_IFCHR)) &&
+             (dst[i].ia_rdev != src[i].ia_rdev)) ||
+            (gf_uuid_compare(dst[i].ia_gfid, src[i].ia_gfid) != 0)) {
+            failed = _gf_true;
+        }
+        /* Check for not so stable fields. These fields can change if the
+         * inode is not locked. */
+        if (!failed && ((dst[i].ia_uid != src[i].ia_uid) ||
+                        (dst[i].ia_gid != src[i].ia_gid) ||
+                        (st_mode_from_ia(dst[i].ia_prot, dst[i].ia_type) !=
+                         st_mode_from_ia(src[i].ia_prot, src[i].ia_type)))) {
+            /* NOTE(review): the first entry (dst) is checked here, not
+             * &dst[i]. Confirm whether this is intended. */
+            if (ec_iatt_is_trusted(fop, dst)) {
+                /* If the iatt contains information from an inode that is
+                 * locked, these differences are real problems, so we need to
+                 * report them. Otherwise we ignore them and don't care which
+                 * data is returned. */
+                failed = _gf_true;
+            } else {
+                gf_msg_debug(fop->xl->name, 0,
+                             "Ignoring iatt differences because inode is not "
+                             "locked");
+            }
+        }
+        if (failed) {
+            /* Each field is logged as a <dst>-<src> pair, so the source
+             * mode must be computed from the source type. */
+            gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_IATT_COMBINE_FAIL,
+                   "Failed to combine iatt (inode: %" PRIu64 "-%" PRIu64
+                   ", "
+                   "links: %u-%u, uid: %u-%u, gid: %u-%u, "
+                   "rdev: %" PRIu64 "-%" PRIu64 ", size: %" PRIu64 "-%" PRIu64
+                   ", "
+                   "mode: %o-%o), %s",
+                   dst[i].ia_ino, src[i].ia_ino, dst[i].ia_nlink,
+                   src[i].ia_nlink, dst[i].ia_uid, src[i].ia_uid, dst[i].ia_gid,
+                   src[i].ia_gid, dst[i].ia_rdev, src[i].ia_rdev,
+                   dst[i].ia_size, src[i].ia_size,
+                   st_mode_from_ia(dst[i].ia_prot, dst[i].ia_type),
+                   st_mode_from_ia(src[i].ia_prot, src[i].ia_type),
+                   ec_msg_str(fop));
+
+            return 0;
+        }
+    }
+
+    while (count-- > 0) {
+        dst[count].ia_blocks += src[count].ia_blocks;
+        if (dst[count].ia_blksize < src[count].ia_blksize) {
+            dst[count].ia_blksize = src[count].ia_blksize;
+        }
+
+        ec_iatt_time_merge(&dst[count].ia_atime, &dst[count].ia_atime_nsec,
+                           src[count].ia_atime, src[count].ia_atime_nsec);
+        ec_iatt_time_merge(&dst[count].ia_mtime, &dst[count].ia_mtime_nsec,
+                           src[count].ia_mtime, src[count].ia_mtime_nsec);
+        ec_iatt_time_merge(&dst[count].ia_ctime, &dst[count].ia_ctime_nsec,
+                           src[count].ia_ctime, src[count].ia_ctime_nsec);
+    }
+
+    return 1;
+}
+
+/* Rescales the accumulated block count of each of the 'count' iatt
+ * structures: ia_blocks * ec->fragments / answers, rounded up. */
+void
+ec_iatt_rebuild(ec_t *ec, struct iatt *iatt, int32_t count, int32_t answers)
+{
+    uint64_t blocks;
+
+    while (count-- > 0) {
+        blocks = iatt[count].ia_blocks * ec->fragments + answers - 1;
+        blocks /= answers;
+        iatt[count].ia_blocks = blocks;
+    }
+}
+
+/* Key filter: returns _gf_false for the keys listed below (stime pattern,
+ * link count and lock/open-fd counters) and _gf_true for any other key.
+ * 'dict', 'value' and 'arg' are not used. */
+gf_boolean_t
+ec_xattr_match(dict_t *dict, char *key, data_t *value, void *arg)
+{
+    if ((fnmatch(GF_XATTR_STIME_PATTERN, key, 0) == 0) ||
+        (strcmp(key, GET_LINK_COUNT) == 0) ||
+        (strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) ||
+        (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0) ||
+        (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0)) {
+        return _gf_false;
+    }
+
+    return _gf_true;
+}
+
+gf_boolean_t
+ec_value_ignore(char *key) +{ + if ((strcmp(key, GF_CONTENT_KEY) == 0) || + (strcmp(key, GF_XATTR_PATHINFO_KEY) == 0) || + (strcmp(key, GF_XATTR_USER_PATHINFO_KEY) == 0) || + (strcmp(key, GF_XATTR_LOCKINFO_KEY) == 0) || + (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0) || + (strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) || + (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0) || + (strncmp(key, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD)) == 0) || + (strcmp(key, DHT_IATT_IN_XDATA_KEY) == 0) || + (strncmp(key, EC_QUOTA_PREFIX, SLEN(EC_QUOTA_PREFIX)) == 0) || + (fnmatch(MARKER_XATTR_PREFIX ".*." XTIME, key, 0) == 0) || + (fnmatch(GF_XATTR_MARKER_KEY ".*", key, 0) == 0) || + (XATTR_IS_NODE_UUID(key))) { + return _gf_true; + } + + return _gf_false; +} + +int32_t +ec_dict_compare(dict_t *dict1, dict_t *dict2) +{ + if (are_dicts_equal(dict1, dict2, ec_xattr_match, ec_value_ignore)) + return 1; + return 0; +} + +static uint32_t +ec_dict_list(data_t **list, ec_cbk_data_t *cbk, int32_t which, char *key, + gf_boolean_t global) +{ + ec_t *ec = cbk->fop->xl->private; + ec_cbk_data_t *ans = NULL; + dict_t *dict = NULL; + data_t *data; + uint32_t count; + int32_t i; + + for (i = 0; i < ec->nodes; i++) { + /* We initialize the list with EC_MISSING_DATA if we are + * returning a global list or the current subvolume belongs + * to the group of the accepted answer. Note that if some + * subvolume is known to be down before issuing the request, + * we won't have any answer from it, so we set here the + * appropriate default value. */ + if (global || ((cbk->mask & (1ULL << i)) != 0)) { + list[i] = EC_MISSING_DATA; + } else { + list[i] = NULL; + } + } + + count = 0; + list_for_each_entry(ans, &cbk->fop->answer_list, answer_list) + { + if (global || ((cbk->mask & ans->mask) != 0)) { + dict = (which == EC_COMBINE_XDATA) ? 
ans->xdata : ans->dict;
+            data = dict_get(dict, key);
+            if (data != NULL) {
+                list[ans->idx] = data;
+                count++;
+            }
+        }
+    }
+
+    return count;
+}
+
+/* Build the pieces used by ec_dict_data_concat() from a printf-like format.
+ *
+ * The formatted text must have the shape "<pre>{<sep>}<post>". It is
+ * allocated into *str and split in place: '{' and '}' are overwritten with
+ * NUL so that *str is the prefix, *sep the separator and *post the suffix.
+ * *sep and *post point inside the *str buffer, so only *str must be freed.
+ *
+ * Returns 0 on success, -ENOMEM if formatting fails, or -EINVAL (after
+ * logging and freeing *str) if either brace is missing. */
+int32_t
+ec_concat_prepare(xlator_t *xl, char **str, char **sep, char **post,
+                  const char *fmt, va_list args)
+{
+    char *tmp;
+    int32_t len;
+
+    len = gf_vasprintf(str, fmt, args);
+    if (len < 0) {
+        return -ENOMEM;
+    }
+
+    tmp = strchr(*str, '{');
+    if (tmp == NULL) {
+        goto out;
+    }
+    *tmp++ = 0;
+    *sep = tmp;
+    tmp = strchr(tmp, '}');
+    if (tmp == NULL) {
+        goto out;
+    }
+    *tmp++ = 0;
+    *post = tmp;
+
+    return 0;
+
+out:
+    gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_FORMAT,
+           "Invalid concat format");
+
+    GF_FREE(*str);
+
+    return -EINVAL;
+}
+
+/* Concatenate the values stored under 'key' in the answers selected by
+ * ec_dict_list() and store the result as a dynamic string in the dictionary
+ * of 'cbk' chosen by 'which' (xdata or dict).
+ *
+ * cbk     - answer whose dictionary receives the result.
+ * which   - EC_COMBINE_XDATA selects cbk->xdata, anything else cbk->dict.
+ * key     - key to read from every answer.
+ * new_key - if not NULL, key used to store the result instead of 'key'.
+ * def     - text used for slots marked EC_MISSING_DATA; when NULL those
+ *           slots are skipped.
+ * global  - forwarded to ec_dict_list().
+ * fmt,... - printf-like format of the form "<pre>{<sep>}<post>" (see
+ *           ec_concat_prepare()).
+ *
+ * The result is "<pre>item1<sep>item2...<sep>itemN<post>". Each stored value
+ * contributes data->len - 1 bytes.
+ * NOTE(review): this assumes every value is a NUL-terminated string with
+ * len >= 1 - confirm against the producers of these keys.
+ *
+ * Returns 0 on success or a negative errno value. */
+static int32_t
+ec_dict_data_concat(ec_cbk_data_t *cbk, int32_t which, char *key, char *new_key,
+                    const char *def, gf_boolean_t global, const char *fmt, ...)
+{
+    ec_t *ec = cbk->fop->xl->private;
+    data_t *data[ec->nodes];
+    char *str = NULL, *pre = NULL, *sep, *post;
+    dict_t *dict;
+    va_list args;
+    int32_t i, num, len, deflen, prelen, postlen, seplen, tmp;
+    int32_t err;
+
+    ec_dict_list(data, cbk, which, key, global);
+
+    va_start(args, fmt);
+    err = ec_concat_prepare(cbk->fop->xl, &pre, &sep, &post, fmt, args);
+    va_end(args);
+
+    if (err != 0) {
+        return err;
+    }
+
+    prelen = strlen(pre);
+    seplen = strlen(sep);
+    postlen = strlen(post);
+
+    deflen = 0;
+    if (def != NULL) {
+        deflen = strlen(def);
+    }
+
+    /* First pass: compute the size of the final string. After the loop
+     * 'num' is the number of separators needed (items - 1, or -1 if there
+     * are no items at all). */
+    len = prelen + postlen + 1;
+    num = -1;
+    for (i = 0; i < ec->nodes; i++) {
+        if (data[i] == NULL) {
+            continue;
+        }
+        if (data[i] == EC_MISSING_DATA) {
+            if (def == NULL) {
+                continue;
+            }
+            len += deflen;
+        } else {
+            len += data[i]->len - 1;
+        }
+        if (num >= 0) {
+            len += seplen;
+        }
+        num++;
+    }
+
+    err = -ENOMEM;
+
+    str = GF_MALLOC(len, gf_common_mt_char);
+    if (str == NULL) {
+        goto out;
+    }
+
+    /* Second pass: copy the items. It must skip exactly the same slots as
+     * the first pass, so the same 'def == NULL' test is used. */
+    memcpy(str, pre, prelen);
+    len = prelen;
+    for (i = 0; i < ec->nodes; i++) {
+        if (data[i] == NULL) {
+            continue;
+        }
+        if (data[i] == EC_MISSING_DATA) {
+            if (def == NULL) {
+                continue;
+            }
+            tmp = deflen;
+            memcpy(str + len, def, tmp);
+        } else {
+            tmp = data[i]->len - 1;
+            memcpy(str + len, data[i]->data, tmp);
+        }
+        len += tmp;
+        /* 'num' counts the separators still to be written. It cannot be
+         * compared against the brick index 'i': skipped slots make the
+         * index run ahead of the number of items already copied. */
+        if (num > 0) {
+            memcpy(str + len, sep, seplen);
+            len += seplen;
+            num--;
+        }
+    }
+    memcpy(str + len, post, postlen + 1);
+
+    dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
+    if (new_key) {
+        key = new_key;
+    }
+    err = dict_set_dynstr(dict, key, str);
+    if (err != 0) {
+        goto out;
+    }
+
+    /* Cleared so the cleanup code below does not free it. */
+    str = NULL;
+
+out:
+    GF_FREE(str);
+    GF_FREE(pre);
+
+    return err;
+}
+
+int32_t
+ec_dict_data_merge(ec_cbk_data_t *cbk, int32_t which, char *key)
+{
+    ec_t *ec = cbk->fop->xl->private;
+    data_t *data[ec->nodes];
+    dict_t *dict, *lockinfo, *tmp = NULL;
+    char *ptr = NULL;
+    int32_t i, len;
+    int32_t err;
+
+    ec_dict_list(data, cbk, which, key, _gf_false);
+
+    lockinfo = dict_new();
+    if (lockinfo == NULL) {
+        return -ENOMEM;
+    }
+
+    for (i = 0; i < ec->nodes; i++) {
+        if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) {
+            continue;
+        }
+
+        tmp = dict_new();
+        if (tmp == NULL) {
+            err = -ENOMEM;
+
+            goto out;
+        }
+        err = dict_unserialize(data[i]->data, data[i]->len, &tmp);
+        if (err != 0) {
+            goto out;
+        }
+        if (dict_copy(tmp, lockinfo) == NULL) {
+            err = -ENOMEM;
+
+            goto out;
+        }
+
+        dict_unref(tmp);
+    }
+
+    tmp = NULL;
+
+    err = dict_allocate_and_serialize(lockinfo, (char **)&ptr,
+                                      (unsigned int *)&len);
+    if (err != 0) {
+        goto out;
+    }
+
+    dict = (which == EC_COMBINE_XDATA) ?
cbk->xdata : cbk->dict; + err = dict_set_dynptr(dict, key, ptr, len); + if (err != 0) { + goto out; + } + + ptr = NULL; + +out: + GF_FREE(ptr); + dict_unref(lockinfo); + if (tmp != NULL) { + dict_unref(tmp); + } + + return err; +} + +int32_t +ec_dict_data_uuid(ec_cbk_data_t *cbk, int32_t which, char *key) +{ + ec_cbk_data_t *ans, *min; + dict_t *src, *dst; + data_t *data; + + min = cbk; + for (ans = cbk->next; ans != NULL; ans = ans->next) { + if (ans->idx < min->idx) { + min = ans; + } + } + + if (min != cbk) { + src = (which == EC_COMBINE_XDATA) ? min->xdata : min->dict; + dst = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict; + + data = dict_get(src, key); + if (data == NULL) { + return -ENOENT; + } + if (dict_set(dst, key, data) != 0) { + return -ENOMEM; + } + } + + return 0; +} + +int32_t +ec_dict_data_iatt(ec_cbk_data_t *cbk, int32_t which, char *key) +{ + ec_t *ec = cbk->fop->xl->private; + data_t *data[ec->nodes]; + dict_t *dict; + struct iatt *stbuf, *tmp; + int32_t i, ret; + + ec_dict_list(data, cbk, which, key, _gf_false); + + stbuf = NULL; + for (i = 0; i < ec->nodes; i++) { + if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) { + continue; + } + tmp = data_to_iatt(data[i], key); + if (tmp == NULL) { + ret = -EINVAL; + goto out; + } + if (stbuf == NULL) { + stbuf = GF_MALLOC(sizeof(struct iatt), gf_common_mt_char); + if (stbuf == NULL) { + ret = -ENOMEM; + goto out; + } + *stbuf = *tmp; + } else { + if (!ec_iatt_combine(cbk->fop, stbuf, tmp, 1)) { + ret = -EINVAL; + goto out; + } + } + } + + if ((stbuf != NULL) && (stbuf->ia_type == IA_IFREG)) { + ec_iatt_rebuild(ec, stbuf, 1, cbk->count); + /* TODO: not sure if an iatt could come in xdata from a fop that takes + * no locks. */ + if (!ec_get_inode_size(cbk->fop, cbk->fop->locks[0].lock->loc.inode, + &stbuf->ia_size)) { + ret = -EINVAL; + goto out; + } + } + + dict = (which == EC_COMBINE_XDATA) ? 
cbk->xdata : cbk->dict; + ret = dict_set_iatt(dict, key, stbuf, false); + if (ret >= 0) { + stbuf = NULL; + } + +out: + GF_FREE(stbuf); + + return ret; +} + +int32_t +ec_dict_data_max32(ec_cbk_data_t *cbk, int32_t which, char *key) +{ + ec_t *ec = cbk->fop->xl->private; + data_t *data[ec->nodes]; + dict_t *dict; + int32_t i; + uint32_t max, tmp; + + ec_dict_list(data, cbk, which, key, _gf_false); + + max = 0; + for (i = 0; i < ec->nodes; i++) { + if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) { + continue; + } + + tmp = data_to_uint32(data[i]); + if (max < tmp) { + max = tmp; + } + } + + dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict; + return dict_set_uint32(dict, key, max); +} + +int32_t +ec_dict_data_max64(ec_cbk_data_t *cbk, int32_t which, char *key) +{ + ec_t *ec = cbk->fop->xl->private; + data_t *data[ec->nodes]; + dict_t *dict; + int32_t i; + uint64_t max, tmp; + + ec_dict_list(data, cbk, which, key, _gf_false); + + max = 0; + for (i = 0; i < ec->nodes; i++) { + if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) { + continue; + } + + tmp = data_to_uint64(data[i]); + if (max < tmp) { + max = tmp; + } + } + + dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict; + return dict_set_uint64(dict, key, max); +} + +int32_t +ec_dict_data_quota(ec_cbk_data_t *cbk, int32_t which, char *key) +{ + ec_t *ec = cbk->fop->xl->private; + data_t *data[ec->nodes]; + dict_t *dict = NULL; + int32_t i = 0; + quota_meta_t size = { + 0, + }; + quota_meta_t max_size = { + 0, + }; + + if (ec_dict_list(data, cbk, which, key, _gf_false) == 0) { + return 0; + } + + /* Quota size xattr is managed outside of the control of the ec xlator. + * This means that it might not be updated at the same time on all + * bricks and we can receive slightly different values. If that's the + * case, we take the maximum of all received values. 
+     */
+    for (i = 0; i < ec->nodes; i++) {
+        if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA) ||
+            (quota_data_to_meta(data[i], &size) < 0)) {
+            continue;
+        }
+
+        if (size.size > max_size.size)
+            max_size.size = size.size;
+        if (size.file_count > max_size.file_count)
+            max_size.file_count = size.file_count;
+        if (size.dir_count > max_size.dir_count)
+            max_size.dir_count = size.dir_count;
+    }
+
+    max_size.size *= ec->fragments;
+
+    dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
+    return quota_dict_set_meta(dict, key, &max_size, IA_IFDIR);
+}
+
+/* Fold the stime value stored under 'key' in each selected answer into the
+ * dictionary of 'cbk' chosen by 'which', using gf_get_max_stime().
+ *
+ * Returns 0 on success. On the first gf_get_max_stime() failure the error is
+ * logged and its (negative) return value is propagated. */
+int32_t
+ec_dict_data_stime(ec_cbk_data_t *cbk, int32_t which, char *key)
+{
+    ec_t *ec = cbk->fop->xl->private;
+    data_t *data[ec->nodes];
+    dict_t *dict;
+    int32_t i, err;
+
+    ec_dict_list(data, cbk, which, key, _gf_false);
+
+    dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
+    for (i = 0; i < ec->nodes; i++) {
+        if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) {
+            continue;
+        }
+        err = gf_get_max_stime(cbk->fop->xl, dict, key, data[i]);
+        if (err != 0) {
+            gf_msg(cbk->fop->xl->name, GF_LOG_ERROR, -err,
+                   EC_MSG_STIME_COMBINE_FAIL, "STIME combination failed");
+
+            return err;
+        }
+    }
+
+    return 0;
+}
+
+/* dict_foreach() callback used by ec_dict_combine(): picks, based on 'key',
+ * the routine that combines the values returned by the different bricks.
+ *
+ * 'arg' is an ec_dict_combine_t carrying the answer being combined and which
+ * of its dictionaries is processed. 'dict' and 'value' are not used here;
+ * the helpers read the values directly from the answers.
+ *
+ * Returns the result of the selected helper, or 0 for keys that need no
+ * combination (including quota keys other than QUOTA_SIZE_KEY). */
+int32_t
+ec_dict_data_combine(dict_t *dict, char *key, data_t *value, void *arg)
+{
+    ec_dict_combine_t *data = arg;
+
+    if ((strcmp(key, GF_XATTR_PATHINFO_KEY) == 0) ||
+        (strcmp(key, GF_XATTR_USER_PATHINFO_KEY) == 0)) {
+        /* Arguments after 'key' are: new_key, def, global, fmt, then the
+         * values consumed by fmt. */
+        return ec_dict_data_concat(data->cbk, data->which, key, NULL, NULL,
+                                   _gf_false, "(<EC:%s> { })",
+                                   data->cbk->fop->xl->name);
+    }
+
+    if (strncmp(key, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD)) == 0) {
+        return ec_dict_data_concat(data->cbk, data->which, key, NULL, NULL,
+                                   _gf_false, "{\n}");
+    }
+
+    if (strncmp(key, GF_XATTR_LOCKINFO_KEY, SLEN(GF_XATTR_LOCKINFO_KEY)) == 0) {
+        return ec_dict_data_merge(data->cbk, data->which, key);
+    }
+
+    if (strcmp(key, GET_LINK_COUNT) == 0) {
+        return ec_dict_data_max32(data->cbk, data->which, key);
+    }
+
+    if (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0) {
+        return ec_dict_data_max32(data->cbk, data->which, key);
+    }
+    if ((strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) ||
+        (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0)) {
+        return ec_dict_data_max32(data->cbk, data->which, key);
+    }
+
+    if (strcmp(key, QUOTA_SIZE_KEY) == 0) {
+        return ec_dict_data_quota(data->cbk, data->which, key);
+    }
+    /* Ignore all other quota attributes */
+    if (strncmp(key, EC_QUOTA_PREFIX, SLEN(EC_QUOTA_PREFIX)) == 0) {
+        return 0;
+    }
+
+    if (XATTR_IS_NODE_UUID(key)) {
+        if (data->cbk->fop->int32) {
+            /* List of node uuid is requested */
+            return ec_dict_data_concat(data->cbk, data->which, key,
+                                       GF_XATTR_LIST_NODE_UUIDS_KEY, UUID0_STR,
+                                       _gf_true, "{ }");
+        } else {
+            return ec_dict_data_uuid(data->cbk, data->which, key);
+        }
+    }
+
+    if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) {
+        return ec_dict_data_stime(data->cbk, data->which, key);
+    }
+
+    if (fnmatch(MARKER_XATTR_PREFIX ".*." XTIME, key, FNM_NOESCAPE) == 0) {
+        return ec_dict_data_max64(data->cbk, data->which, key);
+    }
+
+    if (strcmp(key, GF_PRESTAT) == 0 || strcmp(key, GF_POSTSTAT) == 0) {
+        return ec_dict_data_iatt(data->cbk, data->which, key);
+    }
+
+    return 0;
+}
+
+int32_t
+ec_dict_combine(ec_cbk_data_t *cbk, int32_t which)
+{
+    dict_t *dict = NULL;
+    ec_dict_combine_t data;
+    int32_t err = 0;
+
+    data.cbk = cbk;
+    data.which = which;
+
+    dict = (which == EC_COMBINE_XDATA) ?
cbk->xdata : cbk->dict; + if (dict != NULL) { + err = dict_foreach(dict, ec_dict_data_combine, &data); + if (err != 0) { + gf_msg(cbk->fop->xl->name, GF_LOG_ERROR, -err, + EC_MSG_DICT_COMBINE_FAIL, "Dictionary combination failed"); + + return err; + } + } + + return 0; +} + +int32_t +ec_vector_compare(struct iovec *dst_vector, int32_t dst_count, + struct iovec *src_vector, int32_t src_count) +{ + int32_t dst_size = 0, src_size = 0; + + if (dst_count > 0) { + dst_size = iov_length(dst_vector, dst_count); + } + if (src_count > 0) { + src_size = iov_length(src_vector, src_count); + } + + return (dst_size == src_size); +} + +int32_t +ec_flock_compare(struct gf_flock *dst, struct gf_flock *src) +{ + if ((dst->l_type != src->l_type) || (dst->l_whence != src->l_whence) || + (dst->l_start != src->l_start) || (dst->l_len != src->l_len) || + (dst->l_pid != src->l_pid) || + !is_same_lkowner(&dst->l_owner, &src->l_owner)) { + return 0; + } + + return 1; +} + +void +ec_statvfs_combine(struct statvfs *dst, struct statvfs *src) +{ + if (dst->f_bsize < src->f_bsize) { + dst->f_bsize = src->f_bsize; + } + + if (dst->f_frsize < src->f_frsize) { + dst->f_blocks *= dst->f_frsize; + dst->f_blocks /= src->f_frsize; + + dst->f_bfree *= dst->f_frsize; + dst->f_bfree /= src->f_frsize; + + dst->f_bavail *= dst->f_frsize; + dst->f_bavail /= src->f_frsize; + + dst->f_frsize = src->f_frsize; + } else if (dst->f_frsize > src->f_frsize) { + src->f_blocks *= src->f_frsize; + src->f_blocks /= dst->f_frsize; + + src->f_bfree *= src->f_frsize; + src->f_bfree /= dst->f_frsize; + + src->f_bavail *= src->f_frsize; + src->f_bavail /= dst->f_frsize; + } + if (dst->f_blocks > src->f_blocks) { + dst->f_blocks = src->f_blocks; + } + if (dst->f_bfree > src->f_bfree) { + dst->f_bfree = src->f_bfree; + } + if (dst->f_bavail > src->f_bavail) { + dst->f_bavail = src->f_bavail; + } + + if (dst->f_files < src->f_files) { + dst->f_files = src->f_files; + } + if (dst->f_ffree > src->f_ffree) { + dst->f_ffree = 
src->f_ffree; + } + if (dst->f_favail > src->f_favail) { + dst->f_favail = src->f_favail; + } + if (dst->f_namemax > src->f_namemax) { + dst->f_namemax = src->f_namemax; + } + + if (dst->f_flag != src->f_flag) { + gf_msg_debug(THIS->name, 0, + "Mismatching file system flags " + "(%lX, %lX)", + dst->f_flag, src->f_flag); + } + dst->f_flag &= src->f_flag; +} + +int32_t +ec_combine_check(ec_cbk_data_t *dst, ec_cbk_data_t *src, ec_combine_f combine) +{ + ec_fop_data_t *fop = dst->fop; + + if (dst->op_ret != src->op_ret) { + gf_msg_debug(fop->xl->name, 0, + "Mismatching return code in " + "answers of '%s': %d <-> %d", + ec_fop_name(fop->id), dst->op_ret, src->op_ret); + + return 0; + } + if (dst->op_ret < 0) { + if (dst->op_errno != src->op_errno) { + gf_msg_debug(fop->xl->name, 0, + "Mismatching errno code in " + "answers of '%s': %d <-> %d", + ec_fop_name(fop->id), dst->op_errno, src->op_errno); + + return 0; + } + } + + if (!ec_dict_compare(dst->xdata, src->xdata)) { + gf_msg(fop->xl->name, GF_LOG_DEBUG, 0, EC_MSG_XDATA_MISMATCH, + "Mismatching xdata in answers " + "of '%s'", + ec_fop_name(fop->id)); + + return 0; + } + + if ((dst->op_ret >= 0) && (combine != NULL)) { + return combine(fop, dst, src); + } + + return 1; +} + +void +ec_combine(ec_cbk_data_t *newcbk, ec_combine_f combine) +{ + ec_fop_data_t *fop = newcbk->fop; + ec_cbk_data_t *cbk = NULL, *tmp = NULL; + struct list_head *item = NULL; + int32_t needed = 0; + char str[32]; + + LOCK(&fop->lock); + + fop->received |= newcbk->mask; + + item = fop->cbk_list.prev; + list_for_each_entry(cbk, &fop->cbk_list, list) + { + if (ec_combine_check(newcbk, cbk, combine)) { + newcbk->count += cbk->count; + newcbk->mask |= cbk->mask; + + item = cbk->list.prev; + while (item != &fop->cbk_list) { + tmp = list_entry(item, ec_cbk_data_t, list); + if (tmp->count >= newcbk->count) { + break; + } + item = item->prev; + } + list_del(&cbk->list); + + newcbk->next = cbk; + + break; + } + } + list_add(&newcbk->list, item); + + 
ec_trace("ANSWER", fop, "combine=%s[%d]", + ec_bin(str, sizeof(str), newcbk->mask, 0), newcbk->count); + + cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list); + if ((fop->mask ^ fop->remaining) == fop->received) { + needed = fop->minimum - cbk->count; + } + + UNLOCK(&fop->lock); + + if (needed > 0) { + ec_dispatch_next(fop, newcbk->idx); + } +} diff --git a/xlators/cluster/ec/src/ec-combine.h b/xlators/cluster/ec/src/ec-combine.h new file mode 100644 index 00000000000..1010cc3be26 --- /dev/null +++ b/xlators/cluster/ec/src/ec-combine.h @@ -0,0 +1,44 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_COMBINE_H__ +#define __EC_COMBINE_H__ + +#define EC_COMBINE_DICT 0 +#define EC_COMBINE_XDATA 1 + +typedef int32_t (*ec_combine_f)(ec_fop_data_t *fop, ec_cbk_data_t *dst, + ec_cbk_data_t *src); + +void +ec_iatt_rebuild(ec_t *ec, struct iatt *iatt, int32_t count, int32_t answers); + +int32_t +ec_iatt_combine(ec_fop_data_t *fop, struct iatt *dst, struct iatt *src, + int32_t count); +int32_t +ec_dict_compare(dict_t *dict1, dict_t *dict2); +int32_t +ec_vector_compare(struct iovec *dst_vector, int32_t dst_count, + struct iovec *src_vector, int32_t src_count); +int32_t +ec_flock_compare(struct gf_flock *dst, struct gf_flock *src); +void +ec_statvfs_combine(struct statvfs *dst, struct statvfs *src); + +int32_t +ec_dict_combine(ec_cbk_data_t *cbk, int32_t which); + +void +ec_combine(ec_cbk_data_t *cbk, ec_combine_f combine); + +int32_t +ec_combine_write(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src); +#endif /* __EC_COMBINE_H__ */ diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c new file mode 
100644 index 00000000000..b955efd8c2d --- /dev/null +++ b/xlators/cluster/ec/src/ec-common.c @@ -0,0 +1,3042 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <glusterfs/byte-order.h> +#include <glusterfs/hashfn.h> + +#include "ec-mem-types.h" +#include "ec-types.h" +#include "ec-helpers.h" +#include "ec-combine.h" +#include "ec-common.h" +#include "ec-fops.h" +#include "ec-method.h" +#include "ec.h" +#include "ec-messages.h" + +#define EC_INVALID_INDEX UINT32_MAX + +void +ec_update_fd_status(fd_t *fd, xlator_t *xl, int idx, int32_t ret_status) +{ + ec_fd_t *fd_ctx; + + if (fd == NULL) + return; + + LOCK(&fd->lock); + { + fd_ctx = __ec_fd_get(fd, xl); + if (fd_ctx) { + if (ret_status >= 0) + fd_ctx->fd_status[idx] = EC_FD_OPENED; + else + fd_ctx->fd_status[idx] = EC_FD_NOT_OPENED; + } + } + UNLOCK(&fd->lock); +} + +static uintptr_t +ec_fd_ctx_need_open(fd_t *fd, xlator_t *this, uintptr_t mask) +{ + int i = 0; + int count = 0; + ec_t *ec = NULL; + ec_fd_t *fd_ctx = NULL; + uintptr_t need_open = 0; + + ec = this->private; + + fd_ctx = ec_fd_get(fd, this); + if (!fd_ctx) + return count; + + LOCK(&fd->lock); + { + for (i = 0; i < ec->nodes; i++) { + if ((fd_ctx->fd_status[i] == EC_FD_NOT_OPENED) && + ((ec->xl_up & (1 << i)) != 0) && ((mask & (1 << i)) != 0)) { + fd_ctx->fd_status[i] = EC_FD_OPENING; + need_open |= (1 << i); + count++; + } + } + } + UNLOCK(&fd->lock); + + /* If fd needs to open on minimum number of nodes + * then ignore fixing the fd as it has been + * requested from heal operation. 
+ */ + if (count >= ec->fragments) { + need_open = 0; + } + + return need_open; +} + +static gf_boolean_t +ec_is_fd_fixable(fd_t *fd) +{ + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous(fd)) + return _gf_false; + else if (gf_uuid_is_null(fd->inode->gfid)) + return _gf_false; + + return _gf_true; +} + +static void +ec_fix_open(ec_fop_data_t *fop, uintptr_t mask) +{ + uintptr_t need_open = 0; + int ret = 0; + int32_t flags = 0; + loc_t loc = { + 0, + }; + + if (!ec_is_fd_fixable(fop->fd)) + goto out; + + /* Evaluate how many remote fd's to be opened */ + need_open = ec_fd_ctx_need_open(fop->fd, fop->xl, mask); + if (need_open == 0) { + goto out; + } + + loc.inode = inode_ref(fop->fd->inode); + gf_uuid_copy(loc.gfid, fop->fd->inode->gfid); + ret = loc_path(&loc, NULL); + if (ret < 0) { + goto out; + } + + flags = fop->fd->flags & (~(O_TRUNC | O_APPEND | O_CREAT | O_EXCL)); + if (IA_IFDIR == fop->fd->inode->ia_type) { + ec_opendir(fop->frame, fop->xl, need_open, + EC_MINIMUM_ONE | EC_FOP_NO_PROPAGATE_ERROR, NULL, NULL, + &fop->loc[0], fop->fd, NULL); + } else { + ec_open(fop->frame, fop->xl, need_open, + EC_MINIMUM_ONE | EC_FOP_NO_PROPAGATE_ERROR, NULL, NULL, &loc, + flags, fop->fd, NULL); + } + +out: + loc_wipe(&loc); +} + +static off_t +ec_range_end_get(off_t fl_start, uint64_t fl_size) +{ + if (fl_size > 0) { + if (fl_size >= EC_RANGE_FULL) { + /* Infinity */ + fl_start = LLONG_MAX; + } else { + fl_start += fl_size - 1; + if (fl_start < 0) { + /* Overflow */ + fl_start = LLONG_MAX; + } + } + } + + return fl_start; +} + +static gf_boolean_t +ec_is_range_conflict(ec_lock_link_t *l1, ec_lock_link_t *l2) +{ + return ((l1->fl_end >= l2->fl_start) && (l2->fl_end >= l1->fl_start)); +} + +static gf_boolean_t +ec_lock_conflict(ec_lock_link_t *l1, ec_lock_link_t *l2) +{ + ec_t *ec = l1->fop->xl->private; + + /* Fops like access/stat won't have to worry what the other fops are + * modifying as the fop is wound only to one brick. 
So it can be + * executed in parallel*/ + if (l1->fop->minimum == EC_MINIMUM_ONE || + l2->fop->minimum == EC_MINIMUM_ONE) + return _gf_false; + + if ((l1->fop->flags & EC_FLAG_LOCK_SHARED) && + (l2->fop->flags & EC_FLAG_LOCK_SHARED)) + return _gf_false; + + if (!ec->parallel_writes) { + return _gf_true; + } + + return ec_is_range_conflict(l1, l2); +} + +uint32_t +ec_select_first_by_read_policy(ec_t *ec, ec_fop_data_t *fop) +{ + if (ec->read_policy == EC_ROUND_ROBIN) { + return ec->idx; + } else if (ec->read_policy == EC_GFID_HASH) { + if (fop->use_fd) { + return SuperFastHash((char *)fop->fd->inode->gfid, + sizeof(fop->fd->inode->gfid)) % + ec->nodes; + } else { + if (gf_uuid_is_null(fop->loc[0].gfid)) + loc_gfid(&fop->loc[0], fop->loc[0].gfid); + return SuperFastHash((char *)fop->loc[0].gfid, + sizeof(fop->loc[0].gfid)) % + ec->nodes; + } + } + return 0; +} + +static gf_boolean_t +ec_child_valid(ec_t *ec, ec_fop_data_t *fop, uint32_t idx) +{ + return (idx < ec->nodes) && (((fop->remaining >> idx) & 1) == 1); +} + +static uint32_t +ec_child_next(ec_t *ec, ec_fop_data_t *fop, uint32_t idx) +{ + while (!ec_child_valid(ec, fop, idx)) { + if (++idx >= ec->nodes) { + idx = 0; + } + if (idx == fop->first) { + return EC_INVALID_INDEX; + } + } + + return idx; +} + +int32_t +ec_heal_report(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good, + uintptr_t bad, uint32_t pending, dict_t *xdata) +{ + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL, + "Heal failed"); + } else { + if ((mask & ~good) != 0) { + gf_msg(this->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_SUCCESS, + "Heal succeeded on %d/%d " + "subvolumes", + gf_bits_count(mask & ~(good | bad)), + gf_bits_count(mask & ~good)); + } + } + + return 0; +} + +static uintptr_t +ec_fop_needs_name_heal(ec_fop_data_t *fop) +{ + ec_t *ec = NULL; + ec_cbk_data_t *cbk = NULL; + ec_cbk_data_t *enoent_cbk = NULL; + + ec = fop->xl->private; + if 
(fop->id != GF_FOP_LOOKUP) + return 0; + + if (!fop->loc[0].name || strlen(fop->loc[0].name) == 0) + return 0; + + list_for_each_entry(cbk, &fop->cbk_list, list) + { + if (cbk->op_ret < 0 && cbk->op_errno == ENOENT) { + enoent_cbk = cbk; + break; + } + } + + if (!enoent_cbk) + return 0; + + return ec->xl_up & ~enoent_cbk->mask; +} + +int32_t +ec_fop_needs_heal(ec_fop_data_t *fop) +{ + ec_t *ec = fop->xl->private; + + if (fop->lock_count == 0) { + /* + * if fop->lock_count is zero that means it saw version mismatch + * without any locks so it can't be trusted. If we launch a heal + * based on this it will lead to INODELKs which will affect I/O + * performance. Considering self-heal-daemon and operations on + * the inode from client which take locks can still trigger the + * heal we can choose to not attempt a heal when fop->lock_count + * is zero. + */ + return 0; + } + return (ec->xl_up & ~(fop->remaining | fop->good)) != 0; +} + +void +ec_check_status(ec_fop_data_t *fop) +{ + ec_t *ec = fop->xl->private; + int32_t partial = 0; + char str1[32], str2[32], str3[32], str4[32], str5[32]; + + if (!ec_fop_needs_name_heal(fop) && !ec_fop_needs_heal(fop)) { + return; + } + + if (fop->answer && fop->answer->op_ret >= 0) { + if ((fop->id == GF_FOP_LOOKUP) || (fop->id == GF_FOP_STAT) || + (fop->id == GF_FOP_FSTAT)) { + partial = fop->answer->iatt[0].ia_type == IA_IFDIR; + } else if (fop->id == GF_FOP_OPENDIR) { + partial = 1; + } + } + + gf_msg( + fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS, + "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, " + "remaining=%s, good=%s, bad=%s," + "(Least significant bit represents first client/brick of subvol), %s)", + gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes, + ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), + ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), + ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes), + ec_bin(str4, sizeof(str4), fop->good, ec->nodes), + ec_bin(str5, 
sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good), + ec->nodes), + ec_msg_str(fop)); + if (fop->use_fd) { + if (fop->fd != NULL) { + ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, + fop->fd, partial, NULL); + } + } else { + ec_heal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, + &fop->loc[0], partial, NULL); + + if (fop->loc[1].inode != NULL) { + ec_heal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, + &fop->loc[1], partial, NULL); + } + } +} + +void +ec_update_good(ec_fop_data_t *fop, uintptr_t good) +{ + fop->good = good; + + /* Fops that are executed only on one brick do not have enough information + * to decide if healing is needed or not. */ + if ((fop->expected != 1) && (fop->parent == NULL)) { + ec_check_status(fop); + } +} + +void +ec_lock_update_good(ec_lock_t *lock, ec_fop_data_t *fop) +{ + /* Fops that are executed only on one brick do not have enough information + * to update the global mask of good bricks. */ + if (fop->expected == 1) { + return; + } + + /* When updating the good mask of the lock, we only take into consideration + * those bits corresponding to the bricks where the fop has been executed. + * Bad bricks are removed from good_mask, but once marked as bad it's never + * set to good until the lock is released and reacquired */ + + lock->good_mask &= fop->good | fop->remaining; +} + +void +__ec_fop_set_error(ec_fop_data_t *fop, int32_t error) +{ + if ((error != 0) && (fop->error == 0)) { + fop->error = error; + } +} + +void +ec_fop_set_error(ec_fop_data_t *fop, int32_t error) +{ + LOCK(&fop->lock); + + __ec_fop_set_error(fop, error); + + UNLOCK(&fop->lock); +} + +gf_boolean_t +ec_cbk_set_error(ec_cbk_data_t *cbk, int32_t error, gf_boolean_t ro) +{ + if ((error != 0) && (cbk->op_ret >= 0)) { + /* If cbk->op_errno was 0, it means that the fop succeeded and this + * error has happened while processing the answer. If the operation was + * read-only, there's no problem (i.e. 
we simply return the generated + * error code). However if it caused a modification, we must return EIO + * to indicate that the operation has been partially executed. */ + cbk->op_errno = ro ? error : EIO; + cbk->op_ret = -1; + + ec_fop_set_error(cbk->fop, cbk->op_errno); + } + + return (cbk->op_ret < 0); +} + +ec_cbk_data_t * +ec_fop_prepare_answer(ec_fop_data_t *fop, gf_boolean_t ro) +{ + ec_cbk_data_t *cbk; + int32_t err; + + cbk = fop->answer; + if (cbk == NULL) { + ec_fop_set_error(fop, EIO); + + return NULL; + } + + if (cbk->op_ret < 0) { + ec_fop_set_error(fop, cbk->op_errno); + } + + err = ec_dict_combine(cbk, EC_COMBINE_XDATA); + if (ec_cbk_set_error(cbk, -err, ro)) { + return NULL; + } + + return cbk; +} + +void +ec_sleep(ec_fop_data_t *fop) +{ + LOCK(&fop->lock); + + GF_ASSERT(fop->refs > 0); + fop->refs++; + fop->jobs++; + + UNLOCK(&fop->lock); +} + +int32_t +ec_check_complete(ec_fop_data_t *fop, ec_resume_f resume) +{ + int32_t error = -1; + + LOCK(&fop->lock); + + GF_ASSERT(fop->resume == NULL); + + if (--fop->jobs != 0) { + ec_trace("WAIT", fop, "resume=%p", resume); + + fop->resume = resume; + } else { + error = fop->error; + fop->error = 0; + } + + UNLOCK(&fop->lock); + + return error; +} + +void +ec_resume(ec_fop_data_t *fop, int32_t error) +{ + ec_resume_f resume = NULL; + + LOCK(&fop->lock); + + __ec_fop_set_error(fop, error); + + if (--fop->jobs == 0) { + resume = fop->resume; + fop->resume = NULL; + if (resume != NULL) { + ec_trace("RESUME", fop, "error=%d", error); + + if (fop->error != 0) { + error = fop->error; + } + fop->error = 0; + } + } + + UNLOCK(&fop->lock); + + if (resume != NULL) { + resume(fop, error); + } + + ec_fop_data_release(fop); +} + +void +ec_resume_parent(ec_fop_data_t *fop) +{ + ec_fop_data_t *parent; + int32_t error = 0; + + parent = fop->parent; + if (parent != NULL) { + if ((fop->fop_flags & EC_FOP_NO_PROPAGATE_ERROR) == 0) { + error = fop->error; + } + ec_trace("RESUME_PARENT", fop, "error=%u", error); + fop->parent 
= NULL; + ec_resume(parent, error); + } +} + +gf_boolean_t +ec_is_recoverable_error(int32_t op_errno) +{ + switch (op_errno) { + case ENOTCONN: + case ESTALE: + case ENOENT: + case EBADFD: /*Opened fd but brick is disconnected*/ + case EIO: /*Backend-fs crash like XFS/ext4 etc*/ + return _gf_true; + } + return _gf_false; +} + +void +ec_complete(ec_fop_data_t *fop) +{ + ec_cbk_data_t *cbk = NULL; + int32_t resume = 0, update = 0; + int healing_count = 0; + + LOCK(&fop->lock); + + ec_trace("COMPLETE", fop, ""); + + if (--fop->winds == 0) { + if (fop->answer == NULL) { + if (!list_empty(&fop->cbk_list)) { + cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list); + healing_count = gf_bits_count(cbk->mask & fop->healing); + /* fop shouldn't be treated as success if it is not + * successful on at least fop->minimum good copies*/ + if ((cbk->count - healing_count) >= fop->minimum) { + fop->answer = cbk; + + update = 1; + } + } + + resume = 1; + } + } + + UNLOCK(&fop->lock); + + /* ec_update_good() locks inode->lock. This may cause deadlocks with + fop->lock when used in another order. Since ec_update_good() will not + be called more than once for each fop, it can be called from outside + the fop->lock locked region. 
*/ + if (update) { + ec_update_good(fop, cbk->mask); + } + + if (resume) { + ec_resume(fop, 0); + } + + ec_fop_data_release(fop); +} + +/* There could be already granted locks sitting on the bricks, unlock for which + * must be wound at all costs*/ +static gf_boolean_t +ec_must_wind(ec_fop_data_t *fop) +{ + if ((fop->id == GF_FOP_INODELK) || (fop->id == GF_FOP_FINODELK) || + (fop->id == GF_FOP_LK)) { + if (fop->flock.l_type == F_UNLCK) + return _gf_true; + } else if ((fop->id == GF_FOP_ENTRYLK) || (fop->id == GF_FOP_FENTRYLK)) { + if (fop->entrylk_cmd == ENTRYLK_UNLOCK) + return _gf_true; + } + + return _gf_false; +} + +static gf_boolean_t +ec_internal_op(ec_fop_data_t *fop) +{ + if (ec_must_wind(fop)) + return _gf_true; + if (fop->id == GF_FOP_XATTROP) + return _gf_true; + if (fop->id == GF_FOP_FXATTROP) + return _gf_true; + if (fop->id == GF_FOP_OPEN) + return _gf_true; + return _gf_false; +} + +char * +ec_msg_str(ec_fop_data_t *fop) +{ + loc_t *loc1 = NULL; + loc_t *loc2 = NULL; + char gfid1[64] = {0}; + char gfid2[64] = {0}; + ec_fop_data_t *parent = fop->parent; + + if (fop->errstr) + return fop->errstr; + if (!fop->use_fd) { + loc1 = &fop->loc[0]; + loc2 = &fop->loc[1]; + + if (fop->id == GF_FOP_RENAME) { + gf_asprintf(&fop->errstr, + "FOP : '%s' failed on '%s' and '%s' with gfids " + "%s and %s respectively. Parent FOP: %s", + ec_fop_name(fop->id), loc1->path, loc2->path, + uuid_utoa_r(loc1->gfid, gfid1), + uuid_utoa_r(loc2->gfid, gfid2), + parent ? ec_fop_name(parent->id) : "No Parent"); + } else { + gf_asprintf( + &fop->errstr, + "FOP : '%s' failed on '%s' with gfid %s. Parent FOP: %s", + ec_fop_name(fop->id), loc1->path, + uuid_utoa_r(loc1->gfid, gfid1), + parent ? ec_fop_name(parent->id) : "No Parent"); + } + } else { + gf_asprintf( + &fop->errstr, "FOP : '%s' failed on gfid %s. Parent FOP: %s", + ec_fop_name(fop->id), uuid_utoa_r(fop->fd->inode->gfid, gfid1), + parent ? 
ec_fop_name(parent->id) : "No Parent"); + } + return fop->errstr; +} + +static void +ec_log_insufficient_vol(ec_fop_data_t *fop, int32_t have, uint32_t need, + int32_t loglevel) +{ + ec_t *ec = fop->xl->private; + char str1[32], str2[32], str3[32]; + + gf_msg(ec->xl->name, loglevel, 0, EC_MSG_CHILDS_INSUFFICIENT, + "Insufficient available children for this request: " + "Have : %d, Need : %u : Child UP : %s " + "Mask: %s, Healing : %s : %s ", + have, need, ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), + ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), + ec_bin(str3, sizeof(str3), fop->healing, ec->nodes), + ec_msg_str(fop)); +} + +static int32_t +ec_child_select(ec_fop_data_t *fop) +{ + ec_t *ec = fop->xl->private; + int32_t first = 0, num = 0; + + ec_fop_cleanup(fop); + + fop->mask &= ec->node_mask; + /* Wind the fop on same subvols as parent for any internal extra fops like + * head/tail read in case of writev fop. Unlocks shouldn't do this because + * unlock should go on all subvols where lock is performed*/ + if (fop->parent && !ec_internal_op(fop)) { + fop->mask &= (fop->parent->mask & ~fop->parent->healing); + if (ec_is_data_fop(fop->id)) { + fop->healing |= fop->parent->healing; + } + } + + if ((fop->mask & ~ec->xl_up) != 0) { + gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_EXEC_UNAVAIL, + "Executing operation with " + "some subvolumes unavailable. (%" PRIXPTR "). 
%s ", + fop->mask & ~ec->xl_up, ec_msg_str(fop)); + fop->mask &= ec->xl_up; + } + + switch (fop->minimum) { + case EC_MINIMUM_ALL: + fop->minimum = gf_bits_count(fop->mask); + if (fop->minimum >= ec->fragments) { + break; + } + /* Fall through: fewer than ec->fragments bricks are in the mask, so + * require ec->fragments instead. */ + case EC_MINIMUM_MIN: + fop->minimum = ec->fragments; + break; + case EC_MINIMUM_ONE: + fop->minimum = 1; + } + + if (ec->read_policy == EC_ROUND_ROBIN) { + first = ec->idx; + if (++first >= ec->nodes) { + first = 0; + } + ec->idx = first; + } + + num = gf_bits_count(fop->mask); + /*Unconditionally wind on healing subvolumes*/ + fop->mask |= fop->healing; + fop->remaining = fop->mask; + fop->received = 0; + + ec_trace("SELECT", fop, ""); + + if ((num < fop->minimum) && (num < ec->fragments)) { + ec_log_insufficient_vol(fop, num, fop->minimum, GF_LOG_ERROR); + return 0; + } + + if (!fop->parent && fop->lock_count && + (fop->locks[0].update[EC_DATA_TXN] || + fop->locks[0].update[EC_METADATA_TXN])) { + if (ec->quorum_count && (num < ec->quorum_count)) { + ec_log_insufficient_vol(fop, num, ec->quorum_count, GF_LOG_ERROR); + return 0; + } + } + + return 1; +} + +/* Wind the fop on the next child returned by ec_child_next(), if there is + * one. The child's bit is toggled in fop->remaining and the wind/ref counters + * are bumped under fop->lock; the wind itself is issued after unlocking. */ +void +ec_dispatch_next(ec_fop_data_t *fop, uint32_t idx) +{ + uint32_t i = EC_INVALID_INDEX; + ec_t *ec = fop->xl->private; + + LOCK(&fop->lock); + + i = ec_child_next(ec, fop, idx); + if (i < EC_MAX_NODES) { + idx = i; + + fop->remaining ^= 1ULL << idx; + + /* idx is uint32_t, so trace it as unsigned. */ + ec_trace("EXECUTE", fop, "idx=%u", idx); + + fop->winds++; + fop->refs++; + } + + UNLOCK(&fop->lock); + + if (i < EC_MAX_NODES) { + fop->wind(ec, fop, idx); + } +} + +/* Wind the fop on every child whose bit is set in mask. The bits are toggled + * in fop->remaining and winds/refs are increased by the number of set bits + * before any wind is issued. */ +void +ec_dispatch_mask(ec_fop_data_t *fop, uintptr_t mask) +{ + ec_t *ec = fop->xl->private; + int32_t count, idx; + + count = gf_bits_count(mask); + + LOCK(&fop->lock); + + /* mask is uintptr_t: use PRIXPTR, as ec_child_select() does, instead of + * %lX which is only correct where long and uintptr_t have the same size. */ + ec_trace("EXECUTE", fop, "mask=%" PRIXPTR, mask); + + fop->remaining ^= mask; + + fop->winds += count; + fop->refs += count; + + UNLOCK(&fop->lock); + + idx = 0; + while (mask != 0) { + if ((mask & 1) != 0) { + fop->wind(ec, fop, idx); + } + idx++; + mask >>= 1; + } +} + +void 
+ec_dispatch_start(ec_fop_data_t *fop) +{ + fop->answer = NULL; + fop->good = 0; + + INIT_LIST_HEAD(&fop->cbk_list); + + if (fop->lock_count > 0) { + ec_owner_copy(fop->frame, &fop->req_frame->root->lk_owner); + } +} + +void +ec_dispatch_one(ec_fop_data_t *fop) +{ + ec_dispatch_start(fop); + + if (ec_child_select(fop)) { + ec_sleep(fop); + + fop->expected = 1; + fop->first = ec_select_first_by_read_policy(fop->xl->private, fop); + + ec_dispatch_next(fop, fop->first); + } +} + +gf_boolean_t +ec_dispatch_one_retry(ec_fop_data_t *fop, ec_cbk_data_t **cbk) +{ + ec_cbk_data_t *tmp; + + tmp = ec_fop_prepare_answer(fop, _gf_true); + if (cbk != NULL) { + *cbk = tmp; + } + if ((tmp != NULL) && (tmp->op_ret < 0) && + ec_is_recoverable_error(tmp->op_errno)) { + GF_ASSERT(fop->mask & (1ULL << tmp->idx)); + fop->mask ^= (1ULL << tmp->idx); + if (fop->mask) { + return _gf_true; + } + } + + return _gf_false; +} + +void +ec_dispatch_inc(ec_fop_data_t *fop) +{ + ec_dispatch_start(fop); + + if (ec_child_select(fop)) { + ec_sleep(fop); + + fop->expected = gf_bits_count(fop->remaining); + fop->first = 0; + + ec_dispatch_next(fop, 0); + } +} + +void +ec_dispatch_all(ec_fop_data_t *fop) +{ + ec_dispatch_start(fop); + + if (ec_child_select(fop)) { + ec_sleep(fop); + + fop->expected = gf_bits_count(fop->remaining); + fop->first = 0; + + ec_dispatch_mask(fop, fop->remaining); + } +} + +void +ec_dispatch_min(ec_fop_data_t *fop) +{ + ec_t *ec = fop->xl->private; + uintptr_t mask; + uint32_t idx; + int32_t count; + + ec_dispatch_start(fop); + + if (ec_child_select(fop)) { + ec_sleep(fop); + + fop->expected = count = ec->fragments; + fop->first = ec_select_first_by_read_policy(fop->xl->private, fop); + idx = fop->first - 1; + mask = 0; + while (count-- > 0) { + idx = ec_child_next(ec, fop, idx + 1); + if (idx < EC_MAX_NODES) + mask |= 1ULL << idx; + } + + ec_dispatch_mask(fop, mask); + } +} + +void +ec_succeed_all(ec_fop_data_t *fop) +{ + ec_dispatch_start(fop); + + if (ec_child_select(fop)) { 
+ fop->expected = gf_bits_count(fop->remaining); + fop->first = 0; + + /* Simulate a successful execution on all bricks */ + ec_trace("SUCCEED", fop, ""); + + fop->good = fop->remaining; + fop->remaining = 0; + } +} + +ec_lock_t * +ec_lock_allocate(ec_fop_data_t *fop, loc_t *loc) +{ + ec_t *ec = fop->xl->private; + ec_lock_t *lock; + int32_t err; + + if ((loc->inode == NULL) || + (gf_uuid_is_null(loc->gfid) && gf_uuid_is_null(loc->inode->gfid))) { + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_INODE, + "Trying to lock based on an invalid " + "inode"); + + __ec_fop_set_error(fop, EINVAL); + + return NULL; + } + + lock = mem_get0(ec->lock_pool); + if (lock != NULL) { + lock->good_mask = UINTPTR_MAX; + INIT_LIST_HEAD(&lock->owners); + INIT_LIST_HEAD(&lock->waiting); + INIT_LIST_HEAD(&lock->frozen); + err = ec_loc_from_loc(fop->xl, &lock->loc, loc); + if (err != 0) { + mem_put(lock); + lock = NULL; + + __ec_fop_set_error(fop, -err); + } + } + + return lock; +} + +void +ec_lock_destroy(ec_lock_t *lock) +{ + loc_wipe(&lock->loc); + if (lock->fd != NULL) { + fd_unref(lock->fd); + } + + mem_put(lock); +} + +int32_t +ec_lock_compare(ec_lock_t *lock1, ec_lock_t *lock2) +{ + return gf_uuid_compare(lock1->loc.gfid, lock2->loc.gfid); +} + +static void +ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags, loc_t *base, + off_t fl_start, uint64_t fl_size) +{ + ec_lock_link_t *link; + + /* This check is only prepared for up to 2 locks per fop. If more locks + * are needed this must be changed. */ + if ((fop->lock_count > 0) && + (ec_lock_compare(fop->locks[0].lock, lock) < 0)) { + fop->first_lock = fop->lock_count; + } else { + /* When the first lock is added to the current fop, request lock + * counts from locks xlator to be able to determine if there is + * contention and release the lock sooner. 
*/ + if (fop->xdata == NULL) { + fop->xdata = dict_new(); + if (fop->xdata == NULL) { + ec_fop_set_error(fop, ENOMEM); + return; + } + } + if (dict_set_str(fop->xdata, GLUSTERFS_INODELK_DOM_COUNT, + fop->xl->name) != 0) { + ec_fop_set_error(fop, ENOMEM); + return; + } + } + + link = &fop->locks[fop->lock_count++]; + + link->lock = lock; + link->fop = fop; + link->update[EC_DATA_TXN] = (flags & EC_UPDATE_DATA) != 0; + link->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0; + link->base = base; + link->fl_start = fl_start; + link->fl_end = ec_range_end_get(fl_start, fl_size); + + lock->refs_pending++; +} + +static void +ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc, uint32_t flags, + loc_t *base, off_t fl_start, uint64_t fl_size) +{ + ec_lock_t *lock = NULL; + ec_inode_t *ctx; + + if ((fop->parent != NULL) || (fop->error != 0) || (loc->inode == NULL)) { + return; + } + + LOCK(&loc->inode->lock); + + ctx = __ec_inode_get(loc->inode, fop->xl); + if (ctx == NULL) { + __ec_fop_set_error(fop, ENOMEM); + + goto unlock; + } + + if (ctx->inode_lock != NULL) { + lock = ctx->inode_lock; + + /* If there's another lock, make sure that it's not the same. Otherwise + * do not insert it. + * + * This can only happen on renames where source and target names are + * in the same directory. */ + if ((fop->lock_count > 0) && (fop->locks[0].lock == lock)) { + /* Combine data/meta updates */ + fop->locks[0].update[EC_DATA_TXN] |= (flags & EC_UPDATE_DATA) != 0; + fop->locks[0].update[EC_METADATA_TXN] |= (flags & EC_UPDATE_META) != + 0; + + /* Only one base inode is allowed per fop, so there shouldn't be + * overwrites here. */ + if (base != NULL) { + fop->locks[0].base = base; + } + + goto update_query; + } + + ec_trace("LOCK_INODELK", fop, + "lock=%p, inode=%p. 
Lock already " + "acquired", + lock, loc->inode); + + goto insert; + } + + lock = ec_lock_allocate(fop, loc); + if (lock == NULL) { + goto unlock; + } + + ec_trace("LOCK_CREATE", fop, "lock=%p", lock); + + lock->flock.l_type = F_WRLCK; + lock->flock.l_whence = SEEK_SET; + + lock->ctx = ctx; + ctx->inode_lock = lock; + +insert: + ec_lock_insert(fop, lock, flags, base, fl_start, fl_size); +update_query: + lock->query |= (flags & EC_QUERY_INFO) != 0; +unlock: + UNLOCK(&loc->inode->lock); +} + +void +ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags, + off_t fl_start, uint64_t fl_size) +{ + ec_lock_prepare_inode_internal(fop, loc, flags, NULL, fl_start, fl_size); +} + +void +ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base, + uint32_t flags) +{ + loc_t tmp; + int32_t err; + + if (fop->error != 0) { + return; + } + + err = ec_loc_parent(fop->xl, loc, &tmp); + if (err != 0) { + ec_fop_set_error(fop, -err); + + return; + } + + if ((flags & EC_INODE_SIZE) != 0) { + flags ^= EC_INODE_SIZE; + } else { + base = NULL; + } + + ec_lock_prepare_inode_internal(fop, &tmp, flags, base, 0, EC_RANGE_FULL); + + loc_wipe(&tmp); +} + +void +ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags, off_t fl_start, + uint64_t fl_size) +{ + loc_t loc; + int32_t err; + + if (fop->error != 0) { + return; + } + + err = ec_loc_from_fd(fop->xl, &loc, fd); + if (err != 0) { + ec_fop_set_error(fop, -err); + + return; + } + + ec_lock_prepare_inode_internal(fop, &loc, flags, NULL, fl_start, fl_size); + + loc_wipe(&loc); +} + +gf_boolean_t +ec_config_check(xlator_t *xl, ec_config_t *config) +{ + ec_t *ec; + + ec = xl->private; + if ((config->version != EC_CONFIG_VERSION) || + (config->algorithm != EC_CONFIG_ALGORITHM) || + (config->gf_word_size != EC_GF_BITS) || (config->bricks != ec->nodes) || + (config->redundancy != ec->redundancy) || + (config->chunk_size != EC_METHOD_CHUNK_SIZE)) { + uint32_t data_bricks; + + /* This combination of 
version/algorithm requires the following + values. Incorrect values for these fields are a sign of + corruption: + + redundancy > 0 + redundancy * 2 < bricks + gf_word_size must be a power of 2 + chunk_size (in bits) must be a multiple of gf_word_size * + (bricks - redundancy) */ + + data_bricks = config->bricks - config->redundancy; + if ((config->redundancy < 1) || + (config->redundancy * 2 >= config->bricks) || + !ec_is_power_of_2(config->gf_word_size) || + ((config->chunk_size * 8) % (config->gf_word_size * data_bricks) != + 0)) { + gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_CONFIG, + "Invalid or corrupted config"); + } else { + gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_CONFIG, + "Unsupported config " + "(V=%u, A=%u, W=%u, " + "N=%u, R=%u, S=%u)", + config->version, config->algorithm, config->gf_word_size, + config->bricks, config->redundancy, config->chunk_size); + } + + return _gf_false; + } + + return _gf_true; +} + +gf_boolean_t +ec_set_dirty_flag(ec_lock_link_t *link, ec_inode_t *ctx, uint64_t *dirty) +{ + gf_boolean_t set_dirty = _gf_false; + + if (link->update[EC_DATA_TXN] && !ctx->dirty[EC_DATA_TXN]) { + if (!link->optimistic_changelog) + dirty[EC_DATA_TXN] = 1; + } + + if (link->update[EC_METADATA_TXN] && !ctx->dirty[EC_METADATA_TXN]) { + if (!link->optimistic_changelog) + dirty[EC_METADATA_TXN] = 1; + } + + if (dirty[EC_METADATA_TXN] || dirty[EC_DATA_TXN]) { + set_dirty = _gf_true; + } + + return set_dirty; +} + +int32_t +ec_prepare_update_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + struct list_head list; + ec_fop_data_t *fop = cookie, *parent, *tmp; + ec_lock_link_t *parent_link = fop->data; + ec_lock_link_t *link = NULL; + ec_lock_t *lock = NULL; + ec_inode_t *ctx; + gf_boolean_t release = _gf_false; + uint64_t provided_flags = 0; + uint64_t dirty[EC_VERSION_SIZE] = {0, 0}; + lock = parent_link->lock; + parent = parent_link->fop; + ctx = 
lock->ctx; + + INIT_LIST_HEAD(&list); + provided_flags = EC_PROVIDED_FLAGS(parent_link->waiting_flags); + + LOCK(&lock->loc.inode->lock); + + list_for_each_entry(link, &lock->owners, owner_list) + { + if ((link->waiting_flags & provided_flags) != 0) { + link->waiting_flags ^= (link->waiting_flags & provided_flags); + if (EC_NEEDED_FLAGS(link->waiting_flags) == 0) + list_add_tail(&link->fop->cbk_list, &list); + } + } + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_SIZE_VERS_GET_FAIL, + "Failed to get size and version : %s", ec_msg_str(fop)); + + goto unlock; + } + + if (EC_FLAGS_HAVE(provided_flags, EC_FLAG_XATTROP)) { + op_errno = -ec_dict_del_array(dict, EC_XATTR_VERSION, ctx->pre_version, + EC_VERSION_SIZE); + if (op_errno != 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + EC_MSG_VER_XATTR_GET_FAIL, "Unable to get version xattr. %s", + ec_msg_str(fop)); + goto unlock; + } + ctx->post_version[0] += ctx->pre_version[0]; + ctx->post_version[1] += ctx->pre_version[1]; + + ctx->have_version = _gf_true; + + if (lock->loc.inode->ia_type == IA_IFREG || + lock->loc.inode->ia_type == IA_INVAL) { + op_errno = -ec_dict_del_number(dict, EC_XATTR_SIZE, &ctx->pre_size); + if (op_errno != 0) { + if (lock->loc.inode->ia_type == IA_IFREG) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + EC_MSG_SIZE_XATTR_GET_FAIL, + "Unable to get size xattr. %s", ec_msg_str(fop)); + goto unlock; + } + } else { + ctx->post_size = ctx->pre_size; + + ctx->have_size = _gf_true; + } + + op_errno = -ec_dict_del_config(dict, EC_XATTR_CONFIG, &ctx->config); + if (op_errno != 0) { + if ((lock->loc.inode->ia_type == IA_IFREG) || + (op_errno != ENODATA)) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + EC_MSG_CONFIG_XATTR_GET_FAIL, + "Unable to get config xattr. 
%s", ec_msg_str(fop)); + + goto unlock; + } + } else { + if (!ec_config_check(parent->xl, &ctx->config)) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, + EC_MSG_CONFIG_XATTR_INVALID, "Invalid config xattr"); + + op_errno = EINVAL; + + goto unlock; + } + ctx->have_config = _gf_true; + } + } + ctx->have_info = _gf_true; + } + + ec_set_dirty_flag(fop->data, ctx, dirty); + if (dirty[EC_METADATA_TXN] && + (EC_FLAGS_HAVE(provided_flags, EC_FLAG_METADATA_DIRTY))) { + GF_ASSERT(!ctx->dirty[EC_METADATA_TXN]); + ctx->dirty[EC_METADATA_TXN] = 1; + } + + if (dirty[EC_DATA_TXN] && + (EC_FLAGS_HAVE(provided_flags, EC_FLAG_DATA_DIRTY))) { + GF_ASSERT(!ctx->dirty[EC_DATA_TXN]); + ctx->dirty[EC_DATA_TXN] = 1; + } + op_errno = 0; +unlock: + + lock->waiting_flags ^= provided_flags; + + if (op_errno == 0) { + /* If the fop fails on any of the good bricks, it is important to mark + * it dirty and update versions right away if dirty was not set before. + */ + if (lock->good_mask & ~(fop->good | fop->remaining)) { + release = _gf_true; + } + + if (parent_link->update[0] && !parent_link->dirty[0]) { + lock->release |= release; + } + + if (parent_link->update[1] && !parent_link->dirty[1]) { + lock->release |= release; + } + + /* We don't allow the main fop to be executed on bricks that have not + * succeeded the initial xattrop. 
*/ + ec_lock_update_good(lock, fop); + + /*As of now only data healing marks bricks as healing*/ + lock->healing |= fop->healing; + } + + UNLOCK(&lock->loc.inode->lock); + + while (!list_empty(&list)) { + tmp = list_entry(list.next, ec_fop_data_t, cbk_list); + list_del_init(&tmp->cbk_list); + + if (op_errno == 0) { + tmp->mask &= fop->good; + + /*As of now only data healing marks bricks as healing*/ + if (ec_is_data_fop(tmp->id)) { + tmp->healing |= fop->healing; + } + } + + ec_resume(tmp, op_errno); + } + + return 0; +} + +static gf_boolean_t +ec_set_needed_flag(ec_lock_t *lock, ec_lock_link_t *link, uint64_t flag) +{ + uint64_t current; + + link->waiting_flags |= EC_FLAG_NEEDS(flag); + + current = EC_NEEDED_FLAGS(lock->waiting_flags); + if (!EC_FLAGS_HAVE(current, flag)) { + lock->waiting_flags |= EC_FLAG_NEEDS(flag); + link->waiting_flags |= EC_FLAG_PROVIDES(flag); + + return _gf_true; + } + + return _gf_false; +} + +static uint64_t +ec_set_xattrop_flags_and_params(ec_lock_t *lock, ec_lock_link_t *link, + uint64_t *dirty) +{ + uint64_t oldflags = 0; + uint64_t newflags = 0; + ec_inode_t *ctx = lock->ctx; + + oldflags = EC_NEEDED_FLAGS(lock->waiting_flags); + + if (lock->query && !ctx->have_info) { + ec_set_needed_flag(lock, link, EC_FLAG_XATTROP); + } + + if (dirty[EC_DATA_TXN]) { + if (!ec_set_needed_flag(lock, link, EC_FLAG_DATA_DIRTY)) { + dirty[EC_DATA_TXN] = 0; + } + } + + if (dirty[EC_METADATA_TXN]) { + if (!ec_set_needed_flag(lock, link, EC_FLAG_METADATA_DIRTY)) { + dirty[EC_METADATA_TXN] = 0; + } + } + newflags = EC_NEEDED_FLAGS(lock->waiting_flags); + + return oldflags ^ newflags; +} + +void +ec_get_size_version(ec_lock_link_t *link) +{ + loc_t loc; + ec_lock_t *lock; + ec_inode_t *ctx; + ec_fop_data_t *fop; + dict_t *dict = NULL; + dict_t *xdata = NULL; + ec_t *ec = NULL; + int32_t error = 0; + gf_boolean_t set_dirty = _gf_false; + uint64_t allzero[EC_VERSION_SIZE] = {0, 0}; + uint64_t dirty[EC_VERSION_SIZE] = {0, 0}; + lock = link->lock; + ctx = 
lock->ctx; + fop = link->fop; + ec = fop->xl->private; + uint64_t changed_flags = 0; + + if (ec->optimistic_changelog && !(ec->node_mask & ~link->lock->good_mask) && + !ec_is_data_fop(fop->id)) + link->optimistic_changelog = _gf_true; + + memset(&loc, 0, sizeof(loc)); + + LOCK(&lock->loc.inode->lock); + + set_dirty = ec_set_dirty_flag(link, ctx, dirty); + + /* If ec metadata has already been retrieved, do not try again. */ + if (ctx->have_info) { + if (ec_is_data_fop(fop->id)) { + fop->healing |= lock->healing; + } + if (!set_dirty) + goto unlock; + } + + /* Determine if there's something we need to retrieve for the current + * operation. */ + if (!set_dirty && !lock->query && (lock->loc.inode->ia_type != IA_IFREG) && + (lock->loc.inode->ia_type != IA_INVAL)) { + goto unlock; + } + + changed_flags = ec_set_xattrop_flags_and_params(lock, link, dirty); + if (link->waiting_flags) { + /* This fop needs to wait until all its flags are cleared which + * potentially can be cleared by other xattrops that are already + * wound*/ + ec_sleep(fop); + } else { + GF_ASSERT(!changed_flags); + } + +unlock: + UNLOCK(&lock->loc.inode->lock); + + if (!changed_flags) + goto out; + + dict = dict_new(); + if (dict == NULL) { + error = -ENOMEM; + goto out; + } + + if (EC_FLAGS_HAVE(changed_flags, EC_FLAG_XATTROP)) { + /* Once we know that an xattrop will be needed, + * we try to get all available information in a + * single call. 
*/ + error = ec_dict_set_array(dict, EC_XATTR_VERSION, allzero, + EC_VERSION_SIZE); + if (error != 0) { + goto out; + } + + if (lock->loc.inode->ia_type == IA_IFREG || + lock->loc.inode->ia_type == IA_INVAL) { + error = ec_dict_set_number(dict, EC_XATTR_SIZE, 0); + if (error == 0) { + error = ec_dict_set_number(dict, EC_XATTR_CONFIG, 0); + } + if (error != 0) { + goto out; + } + + xdata = dict_new(); + if (xdata == NULL || dict_set_int32(xdata, GF_GET_SIZE, 1)) { + error = -ENOMEM; + goto out; + } + } + } + + if (memcmp(allzero, dirty, sizeof(allzero))) { + error = ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE); + if (error != 0) { + goto out; + } + } + + fop->frame->root->uid = 0; + fop->frame->root->gid = 0; + + /* For normal fops, ec_[f]xattrop() must succeed on at least + * EC_MINIMUM_MIN bricks, however when this is called as part of a + * self-heal operation the mask of target bricks (fop->mask) could + * contain less than EC_MINIMUM_MIN bricks, causing the xattrop to + * always fail. Thus we always use the same minimum used for the main + * fop. 
+ */ + if (lock->fd == NULL) { + error = ec_loc_from_loc(fop->xl, &loc, &lock->loc); + if (error != 0) { + goto out; + } + if (gf_uuid_is_null(loc.pargfid)) { + if (loc.parent != NULL) { + inode_unref(loc.parent); + loc.parent = NULL; + } + GF_FREE((char *)loc.path); + loc.path = NULL; + loc.name = NULL; + } + + ec_xattrop(fop->frame, fop->xl, fop->mask, fop->minimum, + ec_prepare_update_cbk, link, &loc, GF_XATTROP_ADD_ARRAY64, + dict, xdata); + } else { + ec_fxattrop(fop->frame, fop->xl, fop->mask, fop->minimum, + ec_prepare_update_cbk, link, lock->fd, + GF_XATTROP_ADD_ARRAY64, dict, xdata); + } + + error = 0; + +out: + fop->frame->root->uid = fop->uid; + fop->frame->root->gid = fop->gid; + + loc_wipe(&loc); + + if (dict != NULL) { + dict_unref(dict); + } + + if (xdata != NULL) { + dict_unref(xdata); + } + + if (error != 0) { + ec_fop_set_error(fop, -error); + } +} + +gf_boolean_t +__ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size) +{ + ec_inode_t *ctx; + gf_boolean_t found = _gf_false; + + ctx = __ec_inode_get(inode, fop->xl); + if (ctx == NULL) { + goto out; + } + + if (ctx->have_size) { + *size = ctx->post_size; + found = _gf_true; + } + +out: + return found; +} + +gf_boolean_t +ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size) +{ + gf_boolean_t found = _gf_false; + + LOCK(&inode->lock); + { + found = __ec_get_inode_size(fop, inode, size); + } + UNLOCK(&inode->lock); + + return found; +} + +gf_boolean_t +__ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t size) +{ + ec_inode_t *ctx; + gf_boolean_t found = _gf_false; + + ctx = __ec_inode_get(inode, fop->xl); + if (ctx == NULL) { + goto out; + } + + /* Normal fops always have ctx->have_size set. However self-heal calls this + * to prepare the inode, so ctx->have_size will be false. In this case we + * prepare both pre_size and post_size, and set have_size and have_info to + * true. 
*/ + if (!ctx->have_size) { + ctx->pre_size = size; + ctx->have_size = ctx->have_info = _gf_true; + } + ctx->post_size = size; + + found = _gf_true; + +out: + return found; +} + +gf_boolean_t +ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t size) +{ + gf_boolean_t found = _gf_false; + + LOCK(&inode->lock); + { + found = __ec_set_inode_size(fop, inode, size); + } + UNLOCK(&inode->lock); + + return found; +} + +static void +ec_release_stripe_cache(ec_inode_t *ctx) +{ + ec_stripe_list_t *stripe_cache = NULL; + ec_stripe_t *stripe = NULL; + + stripe_cache = &ctx->stripe_cache; + while (!list_empty(&stripe_cache->lru)) { + stripe = list_first_entry(&stripe_cache->lru, ec_stripe_t, lru); + list_del(&stripe->lru); + GF_FREE(stripe); + } + stripe_cache->count = 0; + stripe_cache->max = 0; +} + +void +ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode) +{ + ec_inode_t *ctx; + + LOCK(&inode->lock); + + ctx = __ec_inode_get(inode, fop->xl); + if (ctx == NULL) { + goto unlock; + } + + ec_release_stripe_cache(ctx); + ctx->have_info = _gf_false; + ctx->have_config = _gf_false; + ctx->have_version = _gf_false; + ctx->have_size = _gf_false; + + memset(&ctx->config, 0, sizeof(ctx->config)); + memset(ctx->pre_version, 0, sizeof(ctx->pre_version)); + memset(ctx->post_version, 0, sizeof(ctx->post_version)); + ctx->pre_size = ctx->post_size = 0; + memset(ctx->dirty, 0, sizeof(ctx->dirty)); + +unlock: + UNLOCK(&inode->lock); +} + +int32_t +ec_get_real_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + ec_fop_data_t *fop = cookie; + ec_lock_link_t *link; + + if (op_ret >= 0) { + link = fop->data; + link->size = buf->ia_size; + } else { + /* Prevent failure of parent fop. */ + fop->error = 0; + } + + return 0; +} + +/* This function is used to get the trusted.ec.size xattr from a file when + * no lock is needed on the inode. 
This is only required to maintain iatt + * structs on fops that manipulate directory entries but do not operate + * directly on the inode, like link, rename, ... + * + * Any error processing this request is ignored. In the worst case, an invalid + * or not up to date value in the iatt could cause some cache invalidation. + */ +void +ec_get_real_size(ec_lock_link_t *link) +{ + ec_fop_data_t *fop; + dict_t *xdata; + + if (link->base == NULL || link->base->inode == NULL) { + return; + } + + if (link->base->inode->ia_type != IA_IFREG) { + return; + } + + fop = link->fop; + + if (ec_get_inode_size(fop, link->base->inode, &link->size)) { + return; + } + + xdata = dict_new(); + if (xdata == NULL) { + return; + } + if (ec_dict_set_number(xdata, EC_XATTR_SIZE, 0) != 0) { + goto out; + } + + /* Send a simple lookup. A single answer is considered ok since this value + * is only used to return an iatt struct related to an inode that is not + * locked and has not undergone any operation. */ + ec_lookup(fop->frame, fop->xl, fop->mask, 1, ec_get_real_size_cbk, link, + link->base, xdata); + +out: + if (xdata != NULL) { + dict_unref(xdata); + } +} + +static void +ec_lock_update_fd(ec_lock_t *lock, ec_fop_data_t *fop) +{ + /* If the fop has an fd available, attach it to the lock structure to be + * able to do fxattrop calls instead of xattrop. 
*/ + if (fop->use_fd && (lock->fd == NULL)) { + lock->fd = __fd_ref(fop->fd); + } +} + +static gf_boolean_t +ec_link_has_lock_conflict(ec_lock_link_t *link, gf_boolean_t waitlist_check) +{ + ec_lock_link_t *trav_link = NULL; + + list_for_each_entry(trav_link, &link->lock->owners, owner_list) + { + if (ec_lock_conflict(trav_link, link)) + return _gf_true; + } + + if (!waitlist_check) + return _gf_false; + + list_for_each_entry(trav_link, &link->lock->waiting, wait_list) + { + if (ec_lock_conflict(trav_link, link)) + return _gf_true; + } + + return _gf_false; +} + +static void +ec_lock_wake_shared(ec_lock_t *lock, struct list_head *list) +{ + ec_fop_data_t *fop; + ec_lock_link_t *link; + gf_boolean_t conflict = _gf_false; + + while (!conflict && !list_empty(&lock->waiting)) { + link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list); + fop = link->fop; + + /* If lock is not acquired, at most one fop can be assigned as owner. + * The following fops will need to wait in the lock->waiting queue + * until the lock has been fully acquired. */ + conflict = !lock->acquired; + + /* If the fop is not shareable, only this fop can be assigned as owner. + * Other fops will need to wait until this one finishes. */ + if (ec_link_has_lock_conflict(link, _gf_false)) { + conflict = _gf_true; + } + + /* If only one fop is allowed, it can be assigned as the owner of the + * lock only if there weren't any other owner. 
*/ + if (conflict && !list_empty(&lock->owners)) { + break; + } + + list_move_tail(&link->wait_list, list); + + list_add_tail(&link->owner_list, &lock->owners); + lock->refs_owners++; + + ec_lock_update_fd(lock, fop); + } +} + +static void +ec_lock_apply(ec_lock_link_t *link) +{ + ec_fop_data_t *fop = link->fop; + + fop->mask &= link->lock->good_mask; + fop->locked++; + + ec_get_size_version(link); + ec_get_real_size(link); +} + +gf_boolean_t +ec_lock_acquire(ec_lock_link_t *link); + +static void +ec_lock_resume_shared(struct list_head *list) +{ + ec_lock_link_t *link; + + while (!list_empty(list)) { + link = list_entry(list->next, ec_lock_link_t, wait_list); + list_del_init(&link->wait_list); + + if (link->lock->acquired) { + ec_lock_apply(link); + ec_lock(link->fop); + } else { + GF_ASSERT(list_empty(list)); + + ec_lock_acquire(link); + } + + ec_resume(link->fop, 0); + } +} + +void +ec_lock_acquired(ec_lock_link_t *link) +{ + struct list_head list; + ec_lock_t *lock; + ec_fop_data_t *fop; + + lock = link->lock; + fop = link->fop; + + ec_trace("LOCKED", fop, "lock=%p", lock); + + INIT_LIST_HEAD(&list); + + LOCK(&lock->loc.inode->lock); + + lock->acquired = _gf_true; + if (lock->contention) { + lock->release = _gf_true; + lock->contention = _gf_false; + } + + ec_lock_update_fd(lock, fop); + ec_lock_wake_shared(lock, &list); + + UNLOCK(&lock->loc.inode->lock); + + ec_lock_apply(link); + + if (fop->use_fd && + (link->update[EC_DATA_TXN] || link->update[EC_METADATA_TXN])) { + /* Try to reopen closed fd's only if lock has succeeded. 
*/ + ec_fix_open(fop, lock->mask); + } + + ec_lock_resume_shared(&list); +} + +int32_t +ec_locked(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + ec_fop_data_t *fop = cookie; + ec_lock_link_t *link = NULL; + ec_lock_t *lock = NULL; + + link = fop->data; + lock = link->lock; + if (op_ret >= 0) { + lock->mask = lock->good_mask = fop->good; + lock->healing = 0; + + ec_lock_acquired(link); + ec_lock(fop->parent); + } else { + LOCK(&lock->loc.inode->lock); + { + lock->contention = _gf_false; + } + UNLOCK(&lock->loc.inode->lock); + gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_PREOP_LOCK_FAILED, + "Failed to complete preop lock"); + } + + return 0; +} + +gf_boolean_t +ec_lock_acquire(ec_lock_link_t *link) +{ + ec_lock_t *lock; + ec_fop_data_t *fop; + gf_lkowner_t lk_owner; + + lock = link->lock; + fop = link->fop; + + if (!lock->acquired) { + set_lk_owner_from_ptr(&lk_owner, lock); + + ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p", lock, + lock->loc.inode); + + lock->flock.l_type = F_WRLCK; + ec_inodelk(fop->frame, fop->xl, &lk_owner, -1, EC_MINIMUM_ALL, + ec_locked, link, fop->xl->name, &lock->loc, F_SETLKW, + &lock->flock, NULL); + + return _gf_false; + } + + ec_trace("LOCK_REUSE", fop, "lock=%p", lock); + + ec_lock_acquired(link); + + return _gf_true; +} + +static ec_lock_link_t * +ec_lock_timer_cancel(xlator_t *xl, ec_lock_t *lock) +{ + ec_lock_link_t *timer_link; + + /* If we don't have any timer, there's nothing to cancel. */ + if (lock->timer == NULL) { + return NULL; + } + + /* We are trying to access a lock that has an unlock timer active. + * This means that the lock must be idle, i.e. no fop can be in the + * owner, waiting or frozen lists. It also means that the lock cannot + * have been marked as being released (this is done without timers). + * There should only be one owner reference, but it's possible that + * some fops are being prepared to use this lock. 
*/ + GF_ASSERT((lock->refs_owners == 1) && list_empty(&lock->owners) && + list_empty(&lock->waiting)); + + /* We take the timer_link before cancelling the timer, since a + * successful cancellation will destroy it. It must not be NULL + * because it references the fop responsible for the delayed unlock + * that we are currently trying to cancel. */ + timer_link = lock->timer->data; + GF_ASSERT(timer_link != NULL); + + if (gf_timer_call_cancel(xl->ctx, lock->timer) < 0) { + /* It's too late to avoid the execution of the timer callback. + * Since we need to be sure that the callback has access to all + * needed resources, we cannot resume the execution of the + * timer fop now. This will be done in the callback. */ + timer_link = NULL; + } else { + /* The timer has been cancelled. The fop referenced by + * timer_link holds the last reference. The caller is + * responsible to release it when not needed anymore. */ + ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock); + } + + /* We have two options here: + * + * 1. The timer has been successfully cancelled. + * + * This is the easiest case and we can continue with the currently + * acquired lock. + * + * 2. The timer callback has already been fired. + * + * In this case we have not been able to cancel the timer before + * the timer callback has been fired, but we also know that + * lock->timer != NULL. This means that the timer callback is still + * trying to acquire the inode mutex that we currently own. We are + * safe until we release it. In this case we can safely clear + * lock->timer. This will cause that the timer callback does nothing + * once it acquires the mutex. + */ + lock->timer = NULL; + + return timer_link; +} + +static gf_boolean_t +ec_lock_assign_owner(ec_lock_link_t *link) +{ + ec_fop_data_t *fop; + ec_lock_t *lock; + ec_lock_link_t *timer_link = NULL; + gf_boolean_t assigned = _gf_false; + + /* The link cannot be in any list because we have just finished preparing + * it. 
*/ + GF_ASSERT(list_empty(&link->wait_list)); + + fop = link->fop; + lock = link->lock; + + LOCK(&lock->loc.inode->lock); + + /* Since the link has just been prepared but it's not active yet, the + * refs_pending must be one at least (the ref owned by this link). */ + GF_ASSERT(lock->refs_pending > 0); + /* The link is not pending any more. It will be assigned to the owner, + * waiting or frozen list. */ + lock->refs_pending--; + + if (lock->release) { + ec_trace("LOCK_QUEUE_FREEZE", fop, "lock=%p", lock); + + /* When lock->release is set, we'll unlock the lock as soon as + * possible, meaning that we won't use a timer. */ + GF_ASSERT(lock->timer == NULL); + + /* The lock is marked to be released. We can still have owners and fops + * in the waiting list if they have been added before the lock has been + * marked to be released. However new fops are put into the frozen list + * to wait for the next unlock/lock cycle. */ + list_add_tail(&link->wait_list, &lock->frozen); + + goto unlock; + } + + /* The lock is not marked to be released, so the frozen list should be + * empty. */ + GF_ASSERT(list_empty(&lock->frozen)); + + timer_link = ec_lock_timer_cancel(fop->xl, lock); + + if (!list_empty(&lock->owners)) { + /* There are other owners of this lock. We can only take ownership if + * the lock is already acquired and doesn't have conflict with existing + * owners, or waiters(to prevent starvation). + * Otherwise we need to wait. + */ + if (!lock->acquired || ec_link_has_lock_conflict(link, _gf_true)) { + ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock); + + list_add_tail(&link->wait_list, &lock->waiting); + + goto unlock; + } + } + + list_add_tail(&link->owner_list, &lock->owners); + + /* If timer_link is not NULL, it means that we have inherited the owner + * reference assigned to the timer fop. In this case we simply reuse it. + * Otherwise we need to increase the number of owners. 
*/ + if (timer_link == NULL) { + lock->refs_owners++; + } + + assigned = _gf_true; + +unlock: + if (!assigned) { + /* We have not been able to take ownership of this lock. The fop must + * be put to sleep. */ + ec_sleep(fop); + } + + UNLOCK(&lock->loc.inode->lock); + + /* If we have cancelled the timer, we need to resume the fop that was + * waiting for it. */ + if (timer_link != NULL) { + ec_resume(timer_link->fop, 0); + } + + return assigned; +} + +static void +ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk, + gf_boolean_t release) +{ + struct list_head list; + ec_lock_t *lock = link->lock; + ec_fop_data_t *fop = link->fop; + ec_inode_t *ctx = lock->ctx; + + INIT_LIST_HEAD(&list); + + LOCK(&lock->loc.inode->lock); + + ec_trace("LOCK_DONE", fop, "lock=%p", lock); + + /* Current link must belong to the owner list of the lock. We don't + * decrement lock->refs_owners here because the inode mutex is released + * before ec_unlock() is called and we need to know when the last owner + * unlocks the lock to do proper cleanup. lock->refs_owners is used for + * this task. */ + GF_ASSERT((lock->refs_owners > 0) && !list_empty(&link->owner_list)); + list_del_init(&link->owner_list); + + lock->release |= release; + + if ((fop->error == 0) && (cbk != NULL) && (cbk->op_ret >= 0)) { + if (link->update[0]) { + ctx->post_version[0]++; + } + if (link->update[1]) { + ctx->post_version[1]++; + } + /* If the fop fails on any of the good bricks, it is important to mark + * it dirty and update versions right away. 
*/ + if (link->update[0] || link->update[1]) { + if (lock->good_mask & ~(fop->good | fop->remaining)) { + lock->release = _gf_true; + } + } + } + + if (fop->healing) { + lock->healing = fop->healing & (fop->good | fop->remaining); + } + ec_lock_update_good(lock, fop); + + ec_lock_wake_shared(lock, &list); + + UNLOCK(&lock->loc.inode->lock); + + ec_lock_resume_shared(&list); +} + +void +ec_lock(ec_fop_data_t *fop) +{ + ec_lock_link_t *link; + + /* There is a chance that ec_resume is called on fop even before ec_sleep. + * Which can result in refs == 0 for fop leading to use after free in this + * function when it calls ec_sleep so do ec_sleep at start and ec_resume at + * the end of this function.*/ + ec_sleep(fop); + + while (fop->locked < fop->lock_count) { + /* Since there are only up to 2 locks per fop, this xor will change + * the order of the locks if fop->first_lock is 1. */ + link = &fop->locks[fop->locked ^ fop->first_lock]; + + if (!ec_lock_assign_owner(link) || !ec_lock_acquire(link)) { + break; + } + } + + ec_resume(fop, 0); +} + +void +ec_lock_unfreeze(ec_lock_link_t *link) +{ + struct list_head list; + ec_lock_t *lock; + gf_boolean_t destroy = _gf_false; + + lock = link->lock; + + INIT_LIST_HEAD(&list); + + LOCK(&lock->loc.inode->lock); + + /* The lock must be marked to be released here, since we have just released + * it and any attempt to assign it to more fops must have added them to the + * frozen list. We can only have one active reference here: the one that + * is processing this unfreeze. */ + GF_ASSERT(lock->release && (lock->refs_owners == 1)); + lock->release = _gf_false; + lock->refs_owners = 0; + + lock->acquired = _gf_false; + + /* We are unfreezing a lock. This means that the lock has already been + * released. In this state it shouldn't have a pending timer nor have any + * owner, and the waiting list should be empty. Only the frozen list can + * contain some fop. 
*/ + GF_ASSERT((lock->timer == NULL) && list_empty(&lock->waiting) && + list_empty(&lock->owners)); + + /* We move all frozen fops to the waiting list. */ + list_splice_init(&lock->frozen, &lock->waiting); + + /* If we don't have any fop waiting nor there are any prepared fops using + * this lock, we can finally dispose it. */ + destroy = list_empty(&lock->waiting) && (lock->refs_pending == 0); + if (destroy) { + ec_trace("LOCK_DESTROY", link->fop, "lock=%p", lock); + + lock->ctx->inode_lock = NULL; + } else { + ec_trace("LOCK_UNFREEZE", link->fop, "lock=%p", lock); + + ec_lock_wake_shared(lock, &list); + } + + UNLOCK(&lock->loc.inode->lock); + + ec_lock_resume_shared(&list); + + if (destroy) { + ec_lock_destroy(lock); + } +} + +int32_t +ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + ec_fop_data_t *fop = cookie; + ec_lock_link_t *link = fop->data; + + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_UNLOCK_FAILED, + "entry/inode unlocking failed :(%s)", ec_msg_str(link->fop)); + } else { + ec_trace("UNLOCKED", link->fop, "lock=%p", link->lock); + } + + ec_lock_unfreeze(link); + + return 0; +} + +void +ec_unlock_lock(ec_lock_link_t *link) +{ + ec_lock_t *lock; + ec_fop_data_t *fop; + gf_lkowner_t lk_owner; + + lock = link->lock; + fop = link->fop; + + lock->unlock_now = _gf_false; + ec_clear_inode_info(fop, lock->loc.inode); + + if ((lock->mask != 0) && lock->acquired) { + set_lk_owner_from_ptr(&lk_owner, lock); + lock->flock.l_type = F_UNLCK; + ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock, + lock->loc.inode); + + ec_inodelk(fop->frame, fop->xl, &lk_owner, lock->mask, EC_MINIMUM_ONE, + ec_unlocked, link, fop->xl->name, &lock->loc, F_SETLK, + &lock->flock, NULL); + } else { + ec_lock_unfreeze(link); + } +} + +void +ec_inode_bad_inc(inode_t *inode, xlator_t *xl) +{ + ec_inode_t *ctx = NULL; + + LOCK(&inode->lock); + { + ctx = __ec_inode_get(inode, xl); + if (ctx 
== NULL) { + goto unlock; + } + ctx->bad_version++; + } +unlock: + UNLOCK(&inode->lock); +} + +int32_t +ec_update_size_version_done(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) +{ + ec_fop_data_t *fop = cookie; + ec_lock_link_t *link; + ec_lock_t *lock; + ec_inode_t *ctx; + + link = fop->data; + lock = link->lock; + ctx = lock->ctx; + + if (op_ret < 0) { + if (link->lock->fd == NULL) { + ec_inode_bad_inc(link->lock->loc.inode, this); + } else { + ec_inode_bad_inc(link->lock->fd->inode, this); + } + + gf_msg(fop->xl->name, fop_log_level(fop->id, op_errno), op_errno, + EC_MSG_SIZE_VERS_UPDATE_FAIL, + "Failed to update version and size. %s", ec_msg_str(fop)); + } else { + fop->parent->good &= fop->good; + + ec_lock_update_good(lock, fop); + + if (ec_dict_del_array(xattr, EC_XATTR_VERSION, ctx->post_version, + EC_VERSION_SIZE) == 0) { + ctx->pre_version[0] = ctx->post_version[0]; + ctx->pre_version[1] = ctx->post_version[1]; + + ctx->have_version = _gf_true; + } + if (ec_dict_del_number(xattr, EC_XATTR_SIZE, &ctx->post_size) == 0) { + ctx->pre_size = ctx->post_size; + + ctx->have_size = _gf_true; + } + if ((ec_dict_del_config(xdata, EC_XATTR_CONFIG, &ctx->config) == 0) && + ec_config_check(fop->xl, &ctx->config)) { + ctx->have_config = _gf_true; + } + + ctx->have_info = _gf_true; + } + /* If we are here because of fop's and other than unlock request, + * that means we are still holding a lock. That make sure + * lock->unlock_now can not be modified. 
+ */ + if (lock->unlock_now) { + ec_unlock_lock(fop->data); + } + + return 0; +} + +void +ec_update_size_version(ec_lock_link_t *link, uint64_t *version, uint64_t size, + uint64_t *dirty) +{ + ec_fop_data_t *fop; + ec_lock_t *lock; + ec_inode_t *ctx; + dict_t *dict = NULL; + uintptr_t update_on = 0; + int32_t err = -ENOMEM; + + fop = link->fop; + lock = link->lock; + ctx = lock->ctx; + + ec_trace("UPDATE", fop, "version=%ld/%ld, size=%ld, dirty=%ld/%ld", + version[0], version[1], size, dirty[0], dirty[1]); + + dict = dict_new(); + if (dict == NULL) { + goto out; + } + + /* If we don't have version information or it has been modified, we + * update it. */ + if (!ctx->have_version || (version[0] != 0) || (version[1] != 0)) { + err = ec_dict_set_array(dict, EC_XATTR_VERSION, version, + EC_VERSION_SIZE); + if (err != 0) { + goto out; + } + } + + if (size != 0) { + /* If size has been changed, we should already + * know the previous size of the file. */ + GF_ASSERT(ctx->have_size); + + err = ec_dict_set_number(dict, EC_XATTR_SIZE, size); + if (err != 0) { + goto out; + } + } + + if (dirty[0] || dirty[1]) { + err = ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE); + if (err != 0) { + goto out; + } + } + + /* If config information is not known, we request it now. */ + if ((lock->loc.inode->ia_type == IA_IFREG) && !ctx->have_config) { + /* A failure requesting this xattr is ignored because it's not + * absolutely required right now. 
*/ + (void)ec_dict_set_number(dict, EC_XATTR_CONFIG, 0); + } + + fop->frame->root->uid = 0; + fop->frame->root->gid = 0; + + update_on = lock->good_mask | lock->healing; + + if (link->lock->fd == NULL) { + ec_xattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN, + ec_update_size_version_done, link, &link->lock->loc, + GF_XATTROP_ADD_ARRAY64, dict, NULL); + } else { + ec_fxattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN, + ec_update_size_version_done, link, link->lock->fd, + GF_XATTROP_ADD_ARRAY64, dict, NULL); + } + + fop->frame->root->uid = fop->uid; + fop->frame->root->gid = fop->gid; + + dict_unref(dict); + + return; + +out: + if (dict != NULL) { + dict_unref(dict); + } + + ec_fop_set_error(fop, -err); + + gf_msg(fop->xl->name, GF_LOG_ERROR, -err, EC_MSG_SIZE_VERS_UPDATE_FAIL, + "Unable to update version and size. %s", ec_msg_str(fop)); + + if (lock->unlock_now) { + ec_unlock_lock(fop->data); + } +} + +gf_boolean_t +ec_update_info(ec_lock_link_t *link) +{ + ec_lock_t *lock; + ec_inode_t *ctx; + uint64_t version[2] = {0, 0}; + uint64_t dirty[2] = {0, 0}; + uint64_t size; + ec_t *ec = NULL; + uintptr_t mask; + + lock = link->lock; + ctx = lock->ctx; + ec = link->fop->xl->private; + + /* pre_version[*] will be 0 if have_version is false */ + version[EC_DATA_TXN] = ctx->post_version[EC_DATA_TXN] - + ctx->pre_version[EC_DATA_TXN]; + version[EC_METADATA_TXN] = ctx->post_version[EC_METADATA_TXN] - + ctx->pre_version[EC_METADATA_TXN]; + + size = ctx->post_size - ctx->pre_size; + /* If we set the dirty flag for update fop, we have to unset it. + * If fop has failed on some bricks, leave the dirty as marked. 
*/ + + if (lock->unlock_now) { + if (version[EC_DATA_TXN]) { + /*A data fop will have difference in post and pre version + *and for data fop we send writes on healing bricks also */ + mask = lock->good_mask | lock->healing; + } else { + mask = lock->good_mask; + } + /* Ensure that nodes are up while doing final + * metadata update.*/ + if (!(ec->node_mask & ~(mask)) && !(ec->node_mask & ~ec->xl_up)) { + if (ctx->dirty[EC_DATA_TXN] != 0) { + dirty[EC_DATA_TXN] = -1; + } + if (ctx->dirty[EC_METADATA_TXN] != 0) { + dirty[EC_METADATA_TXN] = -1; + } + /*If everything is fine and we already + *have version xattr set on entry, there + *is no need to update version again*/ + if (ctx->pre_version[EC_DATA_TXN]) { + version[EC_DATA_TXN] = 0; + } + if (ctx->pre_version[EC_METADATA_TXN]) { + version[EC_METADATA_TXN] = 0; + } + } else { + link->optimistic_changelog = _gf_false; + ec_set_dirty_flag(link, ctx, dirty); + } + memset(ctx->dirty, 0, sizeof(ctx->dirty)); + } + + if ((version[EC_DATA_TXN] != 0) || (version[EC_METADATA_TXN] != 0) || + (dirty[EC_DATA_TXN] != 0) || (dirty[EC_METADATA_TXN] != 0)) { + ec_update_size_version(link, version, size, dirty); + return _gf_true; + } + + return _gf_false; +} + +void +ec_unlock_now(ec_lock_link_t *link) +{ + ec_lock_t *lock; + lock = link->lock; + + ec_trace("UNLOCK_NOW", link->fop, "lock=%p", link->lock); + /*At this point, lock is not being used by any fop and + *can not be reused by any fop as it is going to be released. + *lock->unlock_now can not be modified at any other place. 
+ */ + lock->unlock_now = _gf_true; + + if (!ec_update_info(link)) { + ec_unlock_lock(link); + } + + ec_resume(link->fop, 0); +} + +void +ec_lock_release(ec_t *ec, inode_t *inode) +{ + ec_lock_t *lock; + ec_inode_t *ctx; + ec_lock_link_t *timer_link = NULL; + + LOCK(&inode->lock); + + ctx = __ec_inode_get(inode, ec->xl); + if (ctx == NULL) { + goto done; + } + lock = ctx->inode_lock; + if ((lock == NULL) || lock->release) { + goto done; + } + + gf_msg_debug(ec->xl->name, 0, "Releasing inode %p due to lock contention", + inode); + + if (!lock->acquired) { + /* This happens if some bricks already got the lock while inodelk is in + * progress. Set release to true after lock is acquired*/ + lock->contention = _gf_true; + goto done; + } + + /* The lock is not marked to be released, so the frozen list should be + * empty. */ + GF_ASSERT(list_empty(&lock->frozen)); + + timer_link = ec_lock_timer_cancel(ec->xl, lock); + + /* We mark the lock to be released as soon as possible. */ + lock->release = _gf_true; + +done: + UNLOCK(&inode->lock); + + /* If we have cancelled the timer, we need to start the unlock of the + * inode. If there was a timer but we have been unable to cancel it + * because it was just triggered, the timer callback will take care + * of releasing the inode. */ + if (timer_link != NULL) { + ec_unlock_now(timer_link); + } +} + +void +ec_unlock_timer_add(ec_lock_link_t *link); + +void +ec_unlock_timer_del(ec_lock_link_t *link) +{ + ec_lock_t *lock; + inode_t *inode; + gf_boolean_t now = _gf_false; + + /* If we are here, it means that the timer has expired before having + * been cancelled. This guarantees that 'link' is still valid because + * the fop that contains it must be pending (if timer cancellation in + * ec_lock_assign_owner() fails, the fop is left sleeping). + * + * At the same time, the fop still has a reference to the lock, so + * it must also be valid. 
+ */ + lock = link->lock; + + /* 'lock' must have a valid inode since it can only be destroyed + * when the lock itself is destroyed, but we have a reference to the + * lock to avoid this. + */ + inode = lock->loc.inode; + + LOCK(&inode->lock); + + if (lock->timer != NULL) { + ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock); + + /* The unlock timer has expired without anyone cancelling it. + * This means that it shouldn't have any owner, and the waiting + * and frozen lists should be empty. It must have only one + * owner reference, but there can be fops being prepared + * though. + * */ + GF_ASSERT(!lock->release && (lock->refs_owners == 1) && + list_empty(&lock->owners) && list_empty(&lock->waiting) && + list_empty(&lock->frozen)); + + gf_timer_call_cancel(link->fop->xl->ctx, lock->timer); + lock->timer = NULL; + + /* Any fop being processed from now on, will need to wait + * until the next unlock/lock cycle. */ + lock->release = now = _gf_true; + } + + UNLOCK(&inode->lock); + + if (now) { + ec_unlock_now(link); + } else { + /* The timer has been cancelled just after firing it but before + * getting here. This means that another fop has used the lock + * and everything should be handled as if this callback had + * not been executed. However we still have an owner + * reference. + * + * We need to release our reference. If this is not the last + * reference (the most common case because another fop has + * taken another ref) we only need to decrement the counter. + * Otherwise we have been delayed enough so that the other fop + * has had time to acquire the reference, do its operation and + * release it. At the time of releasing it, the fop found + * that the ref counter was > 1 (our reference), so the delayed + * unlock timer wasn't started. We need to start it again if we + * are the last reference. + * + * ec_unlock_timer_add() handles both cases. 
+ */ + ec_unlock_timer_add(link); + + /* We need to resume the fop that was waiting for the delayed + * unlock. + */ + ec_resume(link->fop, 0); + } +} + +void +ec_unlock_timer_cbk(void *data) +{ + ec_unlock_timer_del(data); +} + +static gf_boolean_t +ec_eager_lock_used(ec_t *ec, ec_fop_data_t *fop) +{ + /* Fops with no locks at this point mean that they are sent as sub-fops + * of other higher level fops. In this case we simply assume that the + * parent fop will take correct care of the eager lock. */ + if (fop->lock_count == 0) { + return _gf_true; + } + + /* We may have more than one lock, but this only happens in the rename + * fop, and both locks will reference an inode of the same type (a + * directory in this case), so we only need to check the first lock. */ + if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) { + return ec->eager_lock; + } + + return ec->other_eager_lock; +} + +static uint32_t +ec_eager_lock_timeout(ec_t *ec, ec_lock_t *lock) +{ + if (lock->loc.inode->ia_type == IA_IFREG) { + return ec->eager_lock_timeout; + } + + return ec->other_eager_lock_timeout; +} + +static gf_boolean_t +ec_lock_delay_create(ec_lock_link_t *link) +{ + struct timespec delay; + ec_fop_data_t *fop = link->fop; + ec_lock_t *lock = link->lock; + + delay.tv_sec = ec_eager_lock_timeout(fop->xl->private, lock); + delay.tv_nsec = 0; + lock->timer = gf_timer_call_after(fop->xl->ctx, delay, ec_unlock_timer_cbk, + link); + if (lock->timer == NULL) { + gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM, + EC_MSG_UNLOCK_DELAY_FAILED, "Unable to delay an unlock"); + + return _gf_false; + } + + return _gf_true; +} + +void +ec_unlock_timer_add(ec_lock_link_t *link) +{ + ec_fop_data_t *fop = link->fop; + ec_lock_t *lock = link->lock; + gf_boolean_t now = _gf_false; + + LOCK(&lock->loc.inode->lock); + + /* We are trying to unlock the lock. We can have multiple scenarios here, + * but all of them need to have lock->timer == NULL: + * + * 1. 
There are other owners currently running that can call ec_unlock(). + * + * None of them can have started the timer until the last one. But this + * call should be the consequence of this latest one. + * + * 2. There are fops in the waiting or frozen lists. + * + * These fops cannot call ec_unlock(). So we should be here. + * + * We must reach here with at least one owner reference. + */ + GF_ASSERT((lock->timer == NULL) && (lock->refs_owners > 0)); + + /* If the fop detects that a heal is needed, we mark the lock to be + * released as soon as possible. */ + lock->release |= ec_fop_needs_heal(fop); + + if (lock->refs_owners > 1) { + ec_trace("UNLOCK_SKIP", fop, "lock=%p", lock); + + /* If there are other owners we cannot do anything else with the lock. + * Note that the current fop has already been removed from the owners + * list in ec_lock_reuse(). */ + lock->refs_owners--; + + UNLOCK(&lock->loc.inode->lock); + } else if (lock->acquired) { + /* There are no other owners and the lock is acquired. If there were + * fops waiting, at least one of them should have been promoted to an + * owner, so the waiting list should be empty. */ + GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting)); + + ec_t *ec = fop->xl->private; + + /* If everything goes as expected this fop will be put to sleep until + * the timer callback is executed. */ + ec_sleep(fop); + + /* If the lock needs to be released, or ec is shutting down, do not + * delay lock release. */ + if (!lock->release && !ec->shutdown) { + ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock, + lock->release); + + if (!ec_lock_delay_create(link)) { + /* We are unable to create a new timer. We immediately release + * the lock. 
*/ + lock->release = now = _gf_true; + } + + } else { + ec_trace("UNLOCK_FORCE", fop, "lock=%p, release=%d", lock, + lock->release); + lock->release = now = _gf_true; + } + + UNLOCK(&lock->loc.inode->lock); + + if (now) { + ec_unlock_now(link); + } + } else { + /* There are no owners and the lock is not acquired. This can only + * happen if a lock attempt has failed and we get to the unlock step + * of the fop. As in the previous case, the waiting list must be + * empty. */ + GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting)); + + /* We need to mark the lock to be released to correctly handle fops + * that may get in after we release the inode mutex but before + * ec_lock_unfreeze() is processed. */ + lock->release = _gf_true; + + UNLOCK(&lock->loc.inode->lock); + + ec_lock_unfreeze(link); + } +} + +void +ec_unlock(ec_fop_data_t *fop) +{ + int32_t i; + + for (i = 0; i < fop->lock_count; i++) { + ec_unlock_timer_add(&fop->locks[i]); + } +} + +void +ec_flush_size_version(ec_fop_data_t *fop) +{ + GF_ASSERT(fop->lock_count == 1); + ec_update_info(&fop->locks[0]); +} + +static void +ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe, + ec_fop_data_t *fop) +{ + off_t base; + + /* On write fops, we only update existing fragments if the write has + * succeeded. Otherwise, we remove them from the cache. */ + if ((fop->id == GF_FOP_WRITE) && (fop->answer != NULL) && + (fop->answer->op_ret >= 0)) { + base = stripe->frag_offset - fop->frag_range.first; + base *= ec->fragments; + + /* We check if the stripe offset falls inside the real region + * modified by the write fop (a write request is allowed, + * though uncommon, to write less bytes than requested). The + * current write fop implementation doesn't allow partial + * writes of fragments, so if there's no error, we are sure + * that a full stripe has been completely modified or not + * touched at all. 
The value of op_ret may not be a multiple + * of the stripe size because it depends on the requested + * size by the user, so we update the stripe if the write has + * modified at least one byte (meaning ec has written the full + * stripe). */ + if (base < fop->answer->op_ret + fop->head) { + memcpy(stripe->data, fop->vector[0].iov_base + base, + ec->stripe_size); + list_move_tail(&stripe->lru, &stripe_cache->lru); + + GF_ATOMIC_INC(ec->stats.stripe_cache.updates); + } + } else { + stripe->frag_offset = -1; + list_move(&stripe->lru, &stripe_cache->lru); + + GF_ATOMIC_INC(ec->stats.stripe_cache.invals); + } +} + +static void +ec_update_cached_stripes(ec_fop_data_t *fop) +{ + uint64_t first; + uint64_t last; + ec_stripe_t *stripe = NULL; + ec_inode_t *ctx = NULL; + ec_stripe_list_t *stripe_cache = NULL; + inode_t *inode = NULL; + struct list_head *temp; + struct list_head sentinel; + + first = fop->frag_range.first; + /* 'last' represents the first stripe not touched by the operation */ + last = fop->frag_range.last; + + /* If there are no modified stripes, we don't need to do anything + * else. */ + if (last <= first) { + return; + } + + if (!fop->use_fd) { + inode = fop->loc[0].inode; + } else { + inode = fop->fd->inode; + } + + LOCK(&inode->lock); + + ctx = __ec_inode_get(inode, fop->xl); + if (ctx == NULL) { + goto out; + } + stripe_cache = &ctx->stripe_cache; + + /* Since we'll be moving elements of the list to the tail, we might + * end in an infinite loop. To avoid it, we insert a sentinel element + * into the list, so that it will be used to detect when we have + * traversed all existing elements once. 
*/ + list_add_tail(&sentinel, &stripe_cache->lru); + temp = stripe_cache->lru.next; + while (temp != &sentinel) { + stripe = list_entry(temp, ec_stripe_t, lru); + temp = temp->next; + if ((first <= stripe->frag_offset) && (stripe->frag_offset < last)) { + ec_update_stripe(fop->xl->private, stripe_cache, stripe, fop); + } + } + list_del(&sentinel); + +out: + UNLOCK(&inode->lock); +} + +void +ec_lock_reuse(ec_fop_data_t *fop) +{ + ec_cbk_data_t *cbk; + ec_t *ec = NULL; + int32_t i, count; + gf_boolean_t release = _gf_false; + ec = fop->xl->private; + cbk = fop->answer; + + if (ec_eager_lock_used(ec, fop) && cbk != NULL) { + if (cbk->xdata != NULL) { + if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT, &count) == + 0) && + (count > 1)) { + release = _gf_true; + } + if (release) { + gf_msg_debug(fop->xl->name, 0, "Lock contention detected"); + } + } + } else { + /* If eager lock is disabled or if we haven't get + * an answer with enough quorum, we always release + * the lock. */ + release = _gf_true; + } + ec_update_cached_stripes(fop); + + for (i = 0; i < fop->lock_count; i++) { + ec_lock_next_owner(&fop->locks[i], cbk, release); + } +} + +void +__ec_manager(ec_fop_data_t *fop, int32_t error) +{ + ec_t *ec = fop->xl->private; + + do { + ec_trace("MANAGER", fop, "error=%d", error); + + if (!ec_must_wind(fop)) { + if (ec->xl_up_count < ec->fragments) { + error = ENOTCONN; + } + } + + if (error != 0) { + fop->error = error; + fop->state = -fop->state; + } + + if ((fop->state == EC_STATE_END) || (fop->state == -EC_STATE_END)) { + ec_fop_data_release(fop); + + break; + } + + /* At each state, fop must not be used anywhere else and there + * shouldn't be any pending subfop going on. */ + GF_ASSERT(fop->jobs == 0); + + /* While the manager is running we need to avoid that subfops launched + * from it could finish and call ec_resume() before the fop->handler + * has completed. This could lead to the same manager being executed + * by two threads concurrently. 
ec_check_complete() will take care of + * this reference. */ + fop->jobs = 1; + + fop->state = fop->handler(fop, fop->state); + GF_ASSERT(fop->state >= 0); + + error = ec_check_complete(fop, __ec_manager); + } while (error >= 0); +} + +void +ec_manager(ec_fop_data_t *fop, int32_t error) +{ + GF_ASSERT(fop->jobs == 0); + GF_ASSERT(fop->winds == 0); + GF_ASSERT(fop->error == 0); + + if (fop->state == EC_STATE_START) { + fop->state = EC_STATE_INIT; + } + + __ec_manager(fop, error); +} + +gf_boolean_t +__ec_is_last_fop(ec_t *ec) +{ + if ((list_empty(&ec->pending_fops)) && + (GF_ATOMIC_GET(ec->async_fop_count) == 0)) { + return _gf_true; + } + return _gf_false; +} diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h new file mode 100644 index 00000000000..51493612ac6 --- /dev/null +++ b/xlators/cluster/ec/src/ec-common.h @@ -0,0 +1,234 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_COMMON_H__ +#define __EC_COMMON_H__ + +#include "glusterfs/compat-errno.h" // for ENODATA on BSD +#include "ec-data.h" + +typedef enum { EC_DATA_TXN, EC_METADATA_TXN } ec_txn_t; + +#define EC_FOP_HEAL -1 +#define EC_FOP_FHEAL -2 + +#define EC_CONFIG_VERSION 0 + +#define EC_CONFIG_ALGORITHM 0 + +#define EC_FLAG_LOCK_SHARED 0x0001 + +#define QUORUM_CBK(fn, fop, frame, cookie, this, op_ret, op_errno, params...) 
\ + do { \ + ec_t *__ec = fop->xl->private; \ + int32_t __op_ret = 0; \ + int32_t __op_errno = 0; \ + int32_t __success_count = gf_bits_count(fop->good); \ + \ + __op_ret = op_ret; \ + __op_errno = op_errno; \ + if (!fop->parent && frame && \ + (GF_CLIENT_PID_SELF_HEALD != frame->root->pid) && \ + __ec->quorum_count && (__success_count < __ec->quorum_count) && \ + op_ret >= 0) { \ + __op_ret = -1; \ + __op_errno = EIO; \ + gf_msg(__ec->xl->name, GF_LOG_ERROR, 0, \ + EC_MSG_CHILDS_INSUFFICIENT, \ + "Insufficient available children for this request " \ + "(have %d, need %d). %s", \ + __success_count, __ec->quorum_count, ec_msg_str(fop)); \ + } \ + fn(frame, cookie, this, __op_ret, __op_errno, params); \ + } while (0) + +enum _ec_xattrop_flags { + EC_FLAG_XATTROP, + EC_FLAG_DATA_DIRTY, + EC_FLAG_METADATA_DIRTY, + + /* Add any new flag here, before EC_FLAG_MAX. The maximum number of + * flags that can be defined is 16. */ + + EC_FLAG_MAX +}; + +/* We keep two sets of flags. One to determine what's really providing the + * current xattrop and the other to know what the parent fop of the xattrop + * needs to proceed. It might happen that a fop needs some information that + * is being already requested by a previous fop. The two sets are stored + * contiguously. 
*/ + +#define EC_FLAG_NEEDS(_flag) (1 << (_flag)) +#define EC_FLAG_PROVIDES(_flag) (1 << ((_flag) + EC_FLAG_MAX)) + +#define EC_NEEDED_FLAGS(_flags) ((_flags) & ((1 << EC_FLAG_MAX) - 1)) + +#define EC_PROVIDED_FLAGS(_flags) EC_NEEDED_FLAGS((_flags) >> EC_FLAG_MAX) + +#define EC_FLAGS_HAVE(_flags, _flag) (((_flags) & (1 << (_flag))) != 0) + +#define EC_SELFHEAL_BIT 62 + +#define EC_MINIMUM_ONE (1 << 6) +#define EC_MINIMUM_MIN (2 << 6) +#define EC_MINIMUM_ALL (3 << 6) +#define EC_FOP_NO_PROPAGATE_ERROR (1 << 8) +#define EC_FOP_MINIMUM(_flags) ((_flags)&255) +#define EC_FOP_FLAGS(_flags) ((_flags) & ~255) + +#define EC_UPDATE_DATA 1 +#define EC_UPDATE_META 2 +#define EC_QUERY_INFO 4 +#define EC_INODE_SIZE 8 + +#define EC_STATE_START 0 +#define EC_STATE_END 0 +#define EC_STATE_INIT 1 +#define EC_STATE_LOCK 2 +#define EC_STATE_DISPATCH 3 +#define EC_STATE_PREPARE_ANSWER 4 +#define EC_STATE_REPORT 5 +#define EC_STATE_LOCK_REUSE 6 +#define EC_STATE_UNLOCK 7 + +#define EC_STATE_DELAYED_START 100 + +#define EC_STATE_HEAL_ENTRY_LOOKUP 200 +#define EC_STATE_HEAL_ENTRY_PREPARE 201 +#define EC_STATE_HEAL_PRE_INODELK_LOCK 202 +#define EC_STATE_HEAL_PRE_INODE_LOOKUP 203 +#define EC_STATE_HEAL_XATTRIBUTES_REMOVE 204 +#define EC_STATE_HEAL_XATTRIBUTES_SET 205 +#define EC_STATE_HEAL_ATTRIBUTES 206 +#define EC_STATE_HEAL_OPEN 207 +#define EC_STATE_HEAL_REOPEN_FD 208 +#define EC_STATE_HEAL_UNLOCK 209 +#define EC_STATE_HEAL_UNLOCK_ENTRY 210 +#define EC_STATE_HEAL_DATA_LOCK 211 +#define EC_STATE_HEAL_DATA_COPY 212 +#define EC_STATE_HEAL_DATA_UNLOCK 213 +#define EC_STATE_HEAL_POST_INODELK_LOCK 214 +#define EC_STATE_HEAL_POST_INODE_LOOKUP 215 +#define EC_STATE_HEAL_SETATTR 216 +#define EC_STATE_HEAL_POST_INODELK_UNLOCK 217 +#define EC_STATE_HEAL_DISPATCH 218 + +/* Value to cover the full range of a file */ +#define EC_RANGE_FULL ((uint64_t)LLONG_MAX + 1) + +gf_boolean_t +ec_dispatch_one_retry(ec_fop_data_t *fop, ec_cbk_data_t **cbk); +void +ec_dispatch_next(ec_fop_data_t *fop, uint32_t 
idx); + +void +ec_complete(ec_fop_data_t *fop); + +void +ec_update_good(ec_fop_data_t *fop, uintptr_t good); + +void +ec_fop_set_error(ec_fop_data_t *fop, int32_t error); + +void +__ec_fop_set_error(ec_fop_data_t *fop, int32_t error); + +ec_cbk_data_t * +ec_fop_prepare_answer(ec_fop_data_t *fop, gf_boolean_t ro); + +gf_boolean_t +ec_cbk_set_error(ec_cbk_data_t *cbk, int32_t error, gf_boolean_t ro); + +void +ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags, + off_t fl_start, uint64_t fl_size); +void +ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base, + uint32_t flags); +void +ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags, off_t fl_start, + uint64_t fl_size); +void +ec_lock(ec_fop_data_t *fop); +void +ec_lock_reuse(ec_fop_data_t *fop); +void +ec_unlock(ec_fop_data_t *fop); +void +ec_lock_release(ec_t *ec, inode_t *inode); + +gf_boolean_t +ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size); +gf_boolean_t +__ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size); +gf_boolean_t +ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t size); +gf_boolean_t +__ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t size); +void +ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode); + +void +ec_flush_size_version(ec_fop_data_t *fop); + +void +ec_dispatch_all(ec_fop_data_t *fop); +void +ec_dispatch_inc(ec_fop_data_t *fop); +void +ec_dispatch_min(ec_fop_data_t *fop); +void +ec_dispatch_one(ec_fop_data_t *fop); + +void +ec_succeed_all(ec_fop_data_t *fop); + +void +ec_sleep(ec_fop_data_t *fop); +void +ec_resume(ec_fop_data_t *fop, int32_t error); +void +ec_resume_parent(ec_fop_data_t *fop); + +void +ec_manager(ec_fop_data_t *fop, int32_t error); +gf_boolean_t +ec_is_recoverable_error(int32_t op_errno); +void +ec_handle_healers_done(ec_fop_data_t *fop); + +int32_t +ec_heal_inspect(call_frame_t *frame, ec_t *ec, inode_t *inode, + unsigned char *locked_on, gf_boolean_t 
self_locked, + gf_boolean_t thorough, ec_heal_need_t *need_heal); +int32_t +ec_get_heal_info(xlator_t *this, loc_t *loc, dict_t **dict); + +int32_t +ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + +void +ec_update_fd_status(fd_t *fd, xlator_t *xl, int child_index, + int32_t ret_status); +gf_boolean_t +ec_is_entry_healing(ec_fop_data_t *fop); +void +ec_set_entry_healing(ec_fop_data_t *fop); +void +ec_reset_entry_healing(ec_fop_data_t *fop); +char * +ec_msg_str(ec_fop_data_t *fop); +gf_boolean_t +__ec_is_last_fop(ec_t *ec); +void +ec_lock_update_good(ec_lock_t *lock, ec_fop_data_t *fop); +#endif /* __EC_COMMON_H__ */ diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c new file mode 100644 index 00000000000..06388833546 --- /dev/null +++ b/xlators/cluster/ec/src/ec-data.c @@ -0,0 +1,288 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include "ec-helpers.h" +#include "ec-common.h" +#include "ec-data.h" +#include "ec-messages.h" + +ec_cbk_data_t * +ec_cbk_data_allocate(call_frame_t *frame, xlator_t *this, ec_fop_data_t *fop, + int32_t id, int32_t idx, int32_t op_ret, int32_t op_errno) +{ + ec_cbk_data_t *cbk; + ec_t *ec = this->private; + + if (fop->xl != this) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_XLATOR_MISMATCH, + "Mismatching xlators between request " + "and answer (req=%s, ans=%s).", + fop->xl->name, this->name); + + return NULL; + } + if (fop->frame != frame) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_FRAME_MISMATCH, + "Mismatching frames between request " + "and answer (req=%p, ans=%p).", + fop->frame, frame); + + return NULL; + } + if (fop->id != id) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_FOP_MISMATCH, + "Mismatching fops between request " + "and answer (req=%d, ans=%d).", + fop->id, id); + + return NULL; + } + + cbk = mem_get0(ec->cbk_pool); + if (cbk == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to allocate memory for an " + "answer."); + return NULL; + } + + cbk->fop = fop; + cbk->idx = idx; + cbk->mask = 1ULL << idx; + cbk->count = 1; + cbk->op_ret = op_ret; + cbk->op_errno = op_errno; + INIT_LIST_HEAD(&cbk->entries.list); + + LOCK(&fop->lock); + + list_add_tail(&cbk->answer_list, &fop->answer_list); + + UNLOCK(&fop->lock); + + return cbk; +} + +void +ec_cbk_data_destroy(ec_cbk_data_t *cbk) +{ + if (cbk->xdata != NULL) { + dict_unref(cbk->xdata); + } + if (cbk->dict != NULL) { + dict_unref(cbk->dict); + } + if (cbk->inode != NULL) { + inode_unref(cbk->inode); + } + if (cbk->fd != NULL) { + fd_unref(cbk->fd); + } + if (cbk->buffers != NULL) { + iobref_unref(cbk->buffers); + } + GF_FREE(cbk->vector); + gf_dirent_free(&cbk->entries); + GF_FREE(cbk->str); + + mem_put(cbk); +} + +ec_fop_data_t * +ec_fop_data_allocate(call_frame_t *frame, xlator_t *this, int32_t id, + uint32_t flags, uintptr_t target, uint32_t 
fop_flags, + ec_wind_f wind, ec_handler_f handler, ec_cbk_t cbks, + void *data) +{ + ec_fop_data_t *fop, *parent; + ec_t *ec = this->private; + + fop = mem_get0(ec->fop_pool); + if (fop == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to allocate memory for a " + "request."); + + return NULL; + } + + INIT_LIST_HEAD(&fop->cbk_list); + INIT_LIST_HEAD(&fop->healer); + INIT_LIST_HEAD(&fop->answer_list); + INIT_LIST_HEAD(&fop->pending_list); + INIT_LIST_HEAD(&fop->locks[0].owner_list); + INIT_LIST_HEAD(&fop->locks[0].wait_list); + INIT_LIST_HEAD(&fop->locks[1].owner_list); + INIT_LIST_HEAD(&fop->locks[1].wait_list); + + fop->xl = this; + fop->req_frame = frame; + + /* fops need a private frame to be able to execute some postop operations + * even if the original fop has completed and reported back to the upper + * xlator and it has destroyed the base frame. + * + * TODO: minimize usage of private frames. Reuse req_frame as much as + * possible. + */ + if (frame != NULL) { + fop->frame = copy_frame(frame); + } else { + fop->frame = create_frame(this, this->ctx->pool); + } + if (fop->frame == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to create a private frame " + "for a request"); + + mem_put(fop); + + return NULL; + } + fop->id = id; + fop->refs = 1; + + fop->flags = flags; + fop->minimum = EC_FOP_MINIMUM(fop_flags); + fop->fop_flags = EC_FOP_FLAGS(fop_flags); + fop->mask = target; + + fop->wind = wind; + fop->handler = handler; + fop->cbks = cbks; + fop->data = data; + + fop->uid = fop->frame->root->uid; + fop->gid = fop->frame->root->gid; + + LOCK_INIT(&fop->lock); + + fop->frame->local = fop; + + if (frame != NULL) { + parent = frame->local; + if (parent != NULL) { + ec_sleep(parent); + } + + fop->parent = parent; + } + + LOCK(&ec->lock); + + list_add_tail(&fop->pending_list, &ec->pending_fops); + + UNLOCK(&ec->lock); + + return fop; +} + +void +ec_fop_data_acquire(ec_fop_data_t *fop) +{ + 
LOCK(&fop->lock); + + ec_trace("ACQUIRE", fop, ""); + + fop->refs++; + + UNLOCK(&fop->lock); +} + +static void +ec_handle_last_pending_fop_completion(ec_fop_data_t *fop, gf_boolean_t *notify) +{ + ec_t *ec = fop->xl->private; + + *notify = _gf_false; + + if (!list_empty(&fop->pending_list)) { + LOCK(&ec->lock); + { + list_del_init(&fop->pending_list); + *notify = __ec_is_last_fop(ec); + } + UNLOCK(&ec->lock); + } +} + +void +ec_fop_cleanup(ec_fop_data_t *fop) +{ + ec_cbk_data_t *cbk, *tmp; + + list_for_each_entry_safe(cbk, tmp, &fop->answer_list, answer_list) + { + list_del_init(&cbk->answer_list); + + ec_cbk_data_destroy(cbk); + } + INIT_LIST_HEAD(&fop->cbk_list); + + fop->answer = NULL; +} + +void +ec_fop_data_release(ec_fop_data_t *fop) +{ + ec_t *ec = NULL; + int32_t refs; + gf_boolean_t notify = _gf_false; + + LOCK(&fop->lock); + + ec_trace("RELEASE", fop, ""); + + GF_ASSERT(fop->refs > 0); + refs = --fop->refs; + + UNLOCK(&fop->lock); + + if (refs == 0) { + fop->frame->local = NULL; + STACK_DESTROY(fop->frame->root); + + LOCK_DESTROY(&fop->lock); + + if (fop->xdata != NULL) { + dict_unref(fop->xdata); + } + if (fop->dict != NULL) { + dict_unref(fop->dict); + } + if (fop->inode != NULL) { + inode_unref(fop->inode); + } + if (fop->fd != NULL) { + fd_unref(fop->fd); + } + if (fop->buffers != NULL) { + iobref_unref(fop->buffers); + } + GF_FREE(fop->vector); + GF_FREE(fop->str[0]); + GF_FREE(fop->str[1]); + loc_wipe(&fop->loc[0]); + loc_wipe(&fop->loc[1]); + GF_FREE(fop->errstr); + + ec_resume_parent(fop); + + ec_fop_cleanup(fop); + + ec = fop->xl->private; + ec_handle_last_pending_fop_completion(fop, ¬ify); + ec_handle_healers_done(fop); + mem_put(fop); + if (notify) { + ec_pending_fops_completed(ec); + } + } +} diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h new file mode 100644 index 00000000000..c8a74ffe1ed --- /dev/null +++ b/xlators/cluster/ec/src/ec-data.h @@ -0,0 +1,35 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. 
<http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_DATA_H__ +#define __EC_DATA_H__ + +#include "ec-types.h" + +ec_cbk_data_t * +ec_cbk_data_allocate(call_frame_t *frame, xlator_t *this, ec_fop_data_t *fop, + int32_t id, int32_t idx, int32_t op_ret, int32_t op_errno); +ec_fop_data_t * +ec_fop_data_allocate(call_frame_t *frame, xlator_t *this, int32_t id, + uint32_t flags, uintptr_t target, uint32_t fop_flags, + ec_wind_f wind, ec_handler_f handler, ec_cbk_t cbks, + void *data); +void +ec_fop_data_acquire(ec_fop_data_t *fop); +void +ec_fop_data_release(ec_fop_data_t *fop); + +void +ec_fop_cleanup(ec_fop_data_t *fop); + +void +ec_pending_fops_completed(ec_t *ec); + +#endif /* __EC_DATA_H__ */ diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c new file mode 100644 index 00000000000..f71dcfac293 --- /dev/null +++ b/xlators/cluster/ec/src/ec-dir-read.c @@ -0,0 +1,647 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include "ec.h" +#include "ec-messages.h" +#include "ec-helpers.h" +#include "ec-common.h" +#include "ec-combine.h" +#include "ec-fops.h" + +/**************************************************************** + * + * File Operation: opendir + * + ***************************************************************/ + +int32_t +ec_combine_opendir(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src) +{ + if (dst->fd != src->fd) { + gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_FD_MISMATCH, + "Mismatching fd in answers " + "of 'GF_FOP_OPENDIR': %p <-> %p", + dst->fd, src->fd); + + return 0; + } + + return 1; +} + +int32_t +ec_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_OPENDIR, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (op_ret >= 0) { + if (fd != NULL) { + cbk->fd = fd_ref(fd); + if (cbk->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, + EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + } + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + ec_combine(cbk, ec_combine_opendir); + + ec_update_fd_status(fd, this, idx, op_ret); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_opendir(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, 
"idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_opendir_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->opendir, + &fop->loc[0], fop->fd, fop->xdata); +} + +int32_t +ec_manager_opendir(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + ec_fd_t *ctx; + int32_t err; + + switch (state) { + case EC_STATE_INIT: + LOCK(&fop->fd->lock); + + ctx = __ec_fd_get(fop->fd, fop->xl); + if (ctx == NULL) { + UNLOCK(&fop->fd->lock); + + fop->error = ENOMEM; + + return EC_STATE_REPORT; + } + if (!ctx->loc.inode) { + err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]); + if (err != 0) { + UNLOCK(&fop->fd->lock); + + fop->error = -err; + + return EC_STATE_REPORT; + } + } + + UNLOCK(&fop->fd->lock); + + /* Fall through */ + + case EC_STATE_LOCK: + ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0, + EC_RANGE_FULL); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_true); + if (cbk != NULL) { + /* Save which subvolumes successfully opened the directory. + * If ctx->open is 0, it means that readdir cannot be + * processed in this directory. 
+ */ + LOCK(&fop->fd->lock); + + ctx = __ec_fd_get(fop->fd, fop->xl); + if (ctx != NULL) { + ctx->open |= cbk->mask; + } + + UNLOCK(&fop->fd->lock); + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.opendir != NULL) { + fop->cbks.opendir(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, cbk->fd, cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.opendir != NULL) { + fop->cbks.opendir(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_opendir(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_opendir_cbk_t func, void *data, loc_t *loc, + fd_t *fd, dict_t *xdata) +{ + ec_cbk_t callback = {.opendir = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(OPENDIR) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_OPENDIR, EC_FLAG_LOCK_SHARED, + target, fop_flags, ec_wind_opendir, + ec_manager_opendir, callback, data); + if (fop == NULL) { + goto out; + } + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (fd != NULL) { + fop->fd = 
fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL); + } +} + +/* Returns -1 if client_id is invalid else index of child subvol in xl_list */ +int +ec_deitransform(xlator_t *this, off_t offset) +{ + int idx = -1; + int client_id = -1; + ec_t *ec = this->private; + char id[32] = {0}; + int err; + + client_id = gf_deitransform(this, offset); + sprintf(id, "%d", client_id); + err = dict_get_int32(ec->leaf_to_subvolid, id, &idx); + if (err < 0) { + idx = err; + goto out; + } + +out: + if (idx < 0) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_REQUEST, + "Invalid index %d in readdirp request", client_id); + idx = -EINVAL; + } + return idx; +} + +/* FOP: readdir */ + +void +ec_adjust_readdirp(ec_t *ec, int32_t idx, gf_dirent_t *entries) +{ + gf_dirent_t *entry; + + list_for_each_entry(entry, &entries->list, list) + { + if (!entry->inode) + continue; + + if (entry->d_stat.ia_type == IA_IFREG) { + if ((entry->dict == NULL) || + (ec_dict_del_number(entry->dict, EC_XATTR_SIZE, + &entry->d_stat.ia_size) != 0)) { + inode_unref(entry->inode); + entry->inode = NULL; + } else { + ec_iatt_rebuild(ec, &entry->d_stat, 1, 1); + } + } + } +} + +int32_t +ec_common_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, 
frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, fop->id, idx, op_ret, + op_errno); + if (cbk) { + if (xdata) + cbk->xdata = dict_ref(xdata); + if (cbk->op_ret >= 0) + list_splice_init(&entries->list, &cbk->entries.list); + ec_combine(cbk, NULL); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_readdir(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_common_readdir_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->readdir, + fop->fd, fop->size, fop->offset, fop->xdata); +} + +int32_t +ec_manager_readdir(ec_fop_data_t *fop, int32_t state) +{ + ec_fd_t *ctx = NULL; + ec_cbk_data_t *cbk = NULL; + + switch (state) { + case EC_STATE_INIT: + /* Return error if opendir has not been successfully called on + * any subvolume. 
*/ + ctx = ec_fd_get(fop->fd, fop->xl); + if (ctx == NULL) { + fop->error = ENOMEM; + } else if (ctx->open == 0) { + fop->error = EBADFD; + } + + if (fop->error) { + gf_msg(fop->xl->name, GF_LOG_ERROR, fop->error, + EC_MSG_INVALID_REQUEST, "EC is not winding readdir: %s", + ec_msg_str(fop)); + return EC_STATE_REPORT; + } + + if (fop->id == GF_FOP_READDIRP) { + int32_t err; + + if (fop->xdata == NULL) { + fop->xdata = dict_new(); + if (fop->xdata == NULL) { + fop->error = ENOMEM; + + return EC_STATE_REPORT; + } + } + + err = dict_set_uint64(fop->xdata, EC_XATTR_SIZE, 0); + if (err != 0) { + fop->error = -err; + + return EC_STATE_REPORT; + } + } + + if (fop->offset != 0) { + /* Non-zero offset is irrecoverable error as the offset may not + * be valid on other bricks*/ + int32_t idx = -1; + + idx = ec_deitransform(fop->xl, fop->offset); + + if (idx < 0) { + fop->error = -idx; + return EC_STATE_REPORT; + } + fop->mask &= 1ULL << idx; + } else { + ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0, + EC_RANGE_FULL); + ec_lock(fop); + } + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_one(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + if (ec_dispatch_one_retry(fop, &cbk)) { + return EC_STATE_DISPATCH; + } + + if ((cbk != NULL) && (cbk->op_ret > 0) && + (fop->id == GF_FOP_READDIRP)) { + ec_adjust_readdirp(fop->xl->private, cbk->idx, &cbk->entries); + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + GF_ASSERT(cbk); + if (fop->id == GF_FOP_READDIR) { + if (fop->cbks.readdir != NULL) { + fop->cbks.readdir(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, &cbk->entries, cbk->xdata); + } + } else { + if (fop->cbks.readdirp != NULL) { + fop->cbks.readdirp(fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, + &cbk->entries, cbk->xdata); + } + } + if (fop->offset == 0) + return EC_STATE_LOCK_REUSE; + else + return EC_STATE_END; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + 
case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + if (fop->id == GF_FOP_READDIR) { + if (fop->cbks.readdir != NULL) { + fop->cbks.readdir(fop->req_frame, fop, fop->xl, -1, + fop->error, NULL, NULL); + } + } else { + if (fop->cbks.readdirp != NULL) { + fop->cbks.readdirp(fop->req_frame, fop, fop->xl, -1, + fop->error, NULL, NULL); + } + } + if (fop->offset == 0) + return EC_STATE_LOCK_REUSE; + else + return EC_STATE_END; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + GF_ASSERT(fop->offset == 0); + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + GF_ASSERT(fop->offset == 0); + ec_unlock(fop); + + return EC_STATE_END; + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_readdir(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_readdir_cbk_t func, void *data, fd_t *fd, + size_t size, off_t offset, dict_t *xdata) +{ + ec_cbk_t callback = {.readdir = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(READDIR) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_READDIR, EC_FLAG_LOCK_SHARED, + target, fop_flags, ec_wind_readdir, + ec_manager_readdir, callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + fop->size = size; + fop->offset = offset; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to 
reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL); + } +} + +/* FOP: readdirp */ + +void +ec_wind_readdirp(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_common_readdir_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->readdirp, + fop->fd, fop->size, fop->offset, fop->xdata); +} + +void +ec_readdirp(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_readdirp_cbk_t func, void *data, fd_t *fd, + size_t size, off_t offset, dict_t *xdata) +{ + ec_cbk_t callback = {.readdirp = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(READDIRP) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate( + frame, this, GF_FOP_READDIRP, EC_FLAG_LOCK_SHARED, target, fop_flags, + ec_wind_readdirp, ec_manager_readdir, callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + fop->size = size; + fop->offset = offset; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL); + } +} diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c new file mode 100644 index 00000000000..53d27d895c3 --- /dev/null +++ 
b/xlators/cluster/ec/src/ec-dir-write.c @@ -0,0 +1,1487 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "ec.h" +#include "ec-messages.h" +#include "ec-helpers.h" +#include "ec-common.h" +#include "ec-combine.h" +#include "ec-method.h" +#include "ec-fops.h" + +int +ec_dir_write_cbk(call_frame_t *frame, xlator_t *this, void *cookie, int op_ret, + int op_errno, struct iatt *poststat, struct iatt *preparent, + struct iatt *postparent, struct iatt *preparent2, + struct iatt *postparent2, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int i = 0; + int idx = 0; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + idx = (long)cookie; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, fop->id, idx, op_ret, + op_errno); + if (!cbk) + goto out; + + if (xdata) + cbk->xdata = dict_ref(xdata); + + if (op_ret < 0) + goto out; + + if (poststat) + cbk->iatt[i++] = *poststat; + + if (preparent) + cbk->iatt[i++] = *preparent; + + if (postparent) + cbk->iatt[i++] = *postparent; + + if (preparent2) + cbk->iatt[i++] = *preparent2; + + if (postparent2) + cbk->iatt[i++] = *postparent2; + +out: + if (cbk) + ec_combine(cbk, ec_combine_write); + + if (fop) + ec_complete(fop); + return 0; +} + +/* FOP: create */ + +int32_t +ec_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf, + struct 
iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); +} + +void +ec_wind_create(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_create_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->create, + &fop->loc[0], fop->int32, fop->mode[0], fop->mode[1], + fop->fd, fop->xdata); +} + +int32_t +ec_manager_create(ec_fop_data_t *fop, int32_t state) +{ + ec_config_t config; + ec_t *ec; + ec_cbk_data_t *cbk; + ec_fd_t *ctx; + uint64_t version[2] = {0, 0}; + int32_t err; + + switch (state) { + case EC_STATE_INIT: + LOCK(&fop->fd->lock); + + ctx = __ec_fd_get(fop->fd, fop->xl); + if (ctx == NULL) { + UNLOCK(&fop->fd->lock); + + fop->error = ENOMEM; + + return EC_STATE_REPORT; + } + err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]); + if (err != 0) { + UNLOCK(&fop->fd->lock); + + fop->error = -err; + + return EC_STATE_REPORT; + } + + ctx->flags = fop->int32; + + UNLOCK(&fop->fd->lock); + + if (fop->xdata == NULL) { + fop->xdata = dict_new(); + if (fop->xdata == NULL) { + fop->error = ENOMEM; + + return EC_STATE_REPORT; + } + } + + ec = fop->xl->private; + + config.version = EC_CONFIG_VERSION; + config.algorithm = EC_CONFIG_ALGORITHM; + config.gf_word_size = EC_GF_BITS; + config.bricks = ec->nodes; + config.redundancy = ec->redundancy; + config.chunk_size = EC_METHOD_CHUNK_SIZE; + + err = ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG, &config); + if (err != 0) { + fop->error = -err; + + return EC_STATE_REPORT; + } + err = ec_dict_set_array(fop->xdata, EC_XATTR_VERSION, version, + EC_VERSION_SIZE); + if (err != 0) { + fop->error = -err; + + return EC_STATE_REPORT; + } + err = ec_dict_set_number(fop->xdata, EC_XATTR_SIZE, 0); + if (err != 0) { + fop->error = -err; + + return EC_STATE_REPORT; + } + + /* We need to write to specific offsets on the bricks, so we 
+ * need to remove O_APPEND from flags (if present) */ + fop->int32 &= ~O_APPEND; + + /* Fall through */ + + case EC_STATE_LOCK: + ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL, + EC_UPDATE_DATA | EC_UPDATE_META); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_false); + if (cbk != NULL) { + int32_t err; + + ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count); + + err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode, + &cbk->iatt[0]); + if (!ec_cbk_set_error(cbk, -err, _gf_false)) { + LOCK(&fop->fd->lock); + + ctx = __ec_fd_get(fop->fd, fop->xl); + if (ctx != NULL) { + ctx->open |= cbk->mask; + } + + UNLOCK(&fop->fd->lock); + } + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.create != NULL) { + QUORUM_CBK(fop->cbks.create, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, fop->fd, + fop->loc[0].inode, &cbk->iatt[0], &cbk->iatt[1], + &cbk->iatt[2], cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.create != NULL) { + fop->cbks.create(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL, NULL, NULL, NULL, NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_create(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, 
fop_create_cbk_t func, void *data, loc_t *loc, + int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + ec_cbk_t callback = {.create = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(CREATE) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_CREATE, 0, target, fop_flags, + ec_wind_create, ec_manager_create, callback, + data); + if (fop == NULL) { + goto out; + } + + fop->int32 = flags; + fop->mode[0] = mode; + fop->mode[1] = umask; + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_copy_with_ref(xdata, NULL); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL, NULL, NULL); + } +} + +/* FOP: link */ + +int32_t +ec_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); +} + +void +ec_wind_link(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_link_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], 
ec->xl_list[idx]->fops->link, + &fop->loc[0], &fop->loc[1], fop->xdata); +} + +int32_t +ec_manager_link(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_LOCK: + ec_lock_prepare_parent_inode( + fop, &fop->loc[1], &fop->loc[0], + EC_UPDATE_DATA | EC_UPDATE_META | EC_INODE_SIZE); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_false); + if (cbk != NULL) { + int32_t err; + + ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count); + + if (cbk->iatt[0].ia_type == IA_IFREG) { + cbk->iatt[0].ia_size = fop->locks[0].size; + } + + err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode, + &cbk->iatt[0]); + ec_cbk_set_error(cbk, -err, _gf_false); + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.link != NULL) { + QUORUM_CBK(fop->cbks.link, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, fop->loc[0].inode, + &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2], + cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.link != NULL) { + fop->cbks.link(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL, NULL, NULL, NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_link(call_frame_t *frame, xlator_t *this, uintptr_t 
target, + uint32_t fop_flags, fop_link_cbk_t func, void *data, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) +{ + ec_cbk_t callback = {.link = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(LINK) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_LINK, 0, target, fop_flags, + ec_wind_link, ec_manager_link, callback, data); + if (fop == NULL) { + goto out; + } + + if (oldloc != NULL) { + if (loc_copy(&fop->loc[0], oldloc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (newloc != NULL) { + if (loc_copy(&fop->loc[1], newloc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_copy_with_ref(xdata, NULL); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL, NULL); + } +} + +/* FOP: mkdir */ + +int32_t +ec_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); +} + +void +ec_wind_mkdir(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_mkdir_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->mkdir, + &fop->loc[0], fop->mode[0], fop->mode[1], fop->xdata); +} + +int32_t 
+ec_manager_mkdir(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + uint64_t version[2] = {0, 0}; + int32_t err; + + switch (state) { + case EC_STATE_INIT: + if (fop->xdata == NULL) { + fop->xdata = dict_new(); + if (fop->xdata == NULL) { + fop->error = ENOMEM; + + return EC_STATE_REPORT; + } + } + + err = ec_dict_set_array(fop->xdata, EC_XATTR_VERSION, version, + EC_VERSION_SIZE); + if (err != 0) { + fop->error = -err; + return EC_STATE_REPORT; + } + + /* Fall through */ + + case EC_STATE_LOCK: + ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL, + EC_UPDATE_DATA | EC_UPDATE_META); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_false); + if (cbk != NULL) { + int32_t err; + + ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count); + + err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode, + &cbk->iatt[0]); + ec_cbk_set_error(cbk, -err, _gf_false); + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.mkdir != NULL) { + QUORUM_CBK(fop->cbks.mkdir, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, fop->loc[0].inode, + &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2], + cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + cbk = fop->answer; + GF_ASSERT(fop->error != 0); + + if (fop->cbks.mkdir != NULL) { + fop->cbks.mkdir(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL, NULL, NULL, + ((cbk) ? 
cbk->xdata : NULL)); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_mkdir(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_mkdir_cbk_t func, void *data, loc_t *loc, + mode_t mode, mode_t umask, dict_t *xdata) +{ + ec_cbk_t callback = {.mkdir = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(MKDIR) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_MKDIR, 0, target, fop_flags, + ec_wind_mkdir, ec_manager_mkdir, callback, data); + if (fop == NULL) { + goto out; + } + + fop->mode[0] = mode; + fop->mode[1] = umask; + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_copy_with_ref(xdata, NULL); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL, NULL); + } +} + +/* FOP: mknod */ + +int32_t +ec_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf, + 
preparent, postparent, NULL, NULL, xdata); +} + +void +ec_wind_mknod(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_mknod_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->mknod, + &fop->loc[0], fop->mode[0], fop->dev, fop->mode[1], + fop->xdata); +} + +int32_t +ec_manager_mknod(ec_fop_data_t *fop, int32_t state) +{ + ec_config_t config; + ec_t *ec; + ec_cbk_data_t *cbk; + uint64_t version[2] = {0, 0}; + + switch (state) { + case EC_STATE_INIT: + if (S_ISREG(fop->mode[0])) { + int32_t err; + + if (fop->xdata == NULL) { + fop->xdata = dict_new(); + if (fop->xdata == NULL) { + fop->error = ENOMEM; + + return EC_STATE_REPORT; + } + } + + ec = fop->xl->private; + + config.version = EC_CONFIG_VERSION; + config.algorithm = EC_CONFIG_ALGORITHM; + config.gf_word_size = EC_GF_BITS; + config.bricks = ec->nodes; + config.redundancy = ec->redundancy; + config.chunk_size = EC_METHOD_CHUNK_SIZE; + + err = ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG, &config); + if (err != 0) { + fop->error = -err; + + return EC_STATE_REPORT; + } + err = ec_dict_set_array(fop->xdata, EC_XATTR_VERSION, version, + EC_VERSION_SIZE); + if (err != 0) { + fop->error = -err; + + return EC_STATE_REPORT; + } + err = ec_dict_set_number(fop->xdata, EC_XATTR_SIZE, 0); + if (err != 0) { + fop->error = -err; + + return EC_STATE_REPORT; + } + } + + /* Fall through */ + + case EC_STATE_LOCK: + ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL, + EC_UPDATE_DATA | EC_UPDATE_META); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_false); + if (cbk != NULL) { + int32_t err; + + ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count); + + err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode, + &cbk->iatt[0]); + ec_cbk_set_error(cbk, -err, _gf_false); + } 
+ + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.mknod != NULL) { + QUORUM_CBK(fop->cbks.mknod, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, fop->loc[0].inode, + &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2], + cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.mknod != NULL) { + fop->cbks.mknod(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL, NULL, NULL, NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_mknod(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_mknod_cbk_t func, void *data, loc_t *loc, + mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) +{ + ec_cbk_t callback = {.mknod = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(MKNOD) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_MKNOD, 0, target, fop_flags, + ec_wind_mknod, ec_manager_mknod, callback, data); + if (fop == NULL) { + goto out; + } + + fop->mode[0] = mode; + fop->dev = rdev; + fop->mode[1] = umask; + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = 
dict_copy_with_ref(xdata, NULL); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL, NULL); + } +} + +/* FOP: rename */ + +int32_t +ec_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, struct iatt *preoldparent, + struct iatt *postoldparent, struct iatt *prenewparent, + struct iatt *postnewparent, dict_t *xdata) +{ + return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf, + preoldparent, postoldparent, prenewparent, + postnewparent, xdata); +} + +void +ec_wind_rename(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_rename_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->rename, + &fop->loc[0], &fop->loc[1], fop->xdata); +} + +int32_t +ec_manager_rename(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_LOCK: + ec_lock_prepare_parent_inode( + fop, &fop->loc[0], &fop->loc[0], + EC_UPDATE_DATA | EC_UPDATE_META | EC_INODE_SIZE); + ec_lock_prepare_parent_inode(fop, &fop->loc[1], NULL, + EC_UPDATE_DATA | EC_UPDATE_META); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_false); + if (cbk != NULL) { + ec_iatt_rebuild(fop->xl->private, cbk->iatt, 5, cbk->count); + + if (cbk->iatt[0].ia_type == IA_IFREG) { + cbk->iatt[0].ia_size = fop->locks[0].size; + } + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.rename != NULL) { + QUORUM_CBK(fop->cbks.rename, fop, 
fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], &cbk->iatt[2], &cbk->iatt[3], + &cbk->iatt[4], cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.rename != NULL) { + fop->cbks.rename(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL, NULL, NULL, NULL, NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_rename(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_rename_cbk_t func, void *data, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) +{ + ec_cbk_t callback = {.rename = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(RENAME) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_RENAME, 0, target, fop_flags, + ec_wind_rename, ec_manager_rename, callback, + data); + if (fop == NULL) { + goto out; + } + + if (oldloc != NULL) { + if (loc_copy(&fop->loc[0], oldloc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (newloc != NULL) { + if (loc_copy(&fop->loc[1], newloc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_copy_with_ref(xdata, NULL); + if 
(fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL, NULL, NULL); + } +} + +/* FOP: rmdir */ + +int32_t +ec_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); +} + +void +ec_wind_rmdir(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_rmdir_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->rmdir, + &fop->loc[0], fop->int32, fop->xdata); +} + +int32_t +ec_manager_rmdir(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_LOCK: + ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL, + EC_UPDATE_DATA | EC_UPDATE_META); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + ec_fop_prepare_answer(fop, _gf_false); + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.rmdir != NULL) { + QUORUM_CBK(fop->cbks.rmdir, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.rmdir != NULL) { + fop->cbks.rmdir(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL, NULL); + } + + return EC_STATE_LOCK_REUSE; + + 
case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_rmdir(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_rmdir_cbk_t func, void *data, loc_t *loc, + int xflags, dict_t *xdata) +{ + ec_cbk_t callback = {.rmdir = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(RMDIR) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_RMDIR, 0, target, fop_flags, + ec_wind_rmdir, ec_manager_rmdir, callback, data); + if (fop == NULL) { + goto out; + } + + fop->int32 = xflags; + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_copy_with_ref(xdata, NULL); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL); + } +} + +/* FOP: symlink */ + +int32_t +ec_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); +} + +void +ec_wind_symlink(ec_t *ec, ec_fop_data_t *fop, int32_t 
idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_symlink_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->symlink, + fop->str[0], &fop->loc[0], fop->mode[0], fop->xdata); +} + +int32_t +ec_manager_symlink(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_LOCK: + ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL, + EC_UPDATE_DATA | EC_UPDATE_META); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_false); + if (cbk != NULL) { + int32_t err; + + ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count); + + err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode, + &cbk->iatt[0]); + ec_cbk_set_error(cbk, -err, _gf_false); + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.symlink != NULL) { + QUORUM_CBK(fop->cbks.symlink, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, fop->loc[0].inode, + &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2], + cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.symlink != NULL) { + fop->cbks.symlink(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL, NULL, NULL, NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void 
+ec_symlink(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_symlink_cbk_t func, void *data, + const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata) +{ + ec_cbk_t callback = {.symlink = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(SYMLINK) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_SYMLINK, 0, target, + fop_flags, ec_wind_symlink, ec_manager_symlink, + callback, data); + if (fop == NULL) { + goto out; + } + + fop->mode[0] = umask; + + if (linkname != NULL) { + fop->str[0] = gf_strdup(linkname); + if (fop->str[0] == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to duplicate a string."); + + goto out; + } + } + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_copy_with_ref(xdata, NULL); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL, NULL); + } +} + +/* FOP: unlink */ + +int32_t +ec_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); +} + +void +ec_wind_unlink(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_unlink_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], 
ec->xl_list[idx]->fops->unlink, + &fop->loc[0], fop->int32, fop->xdata); +} + +int32_t +ec_manager_unlink(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_LOCK: + ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL, + EC_UPDATE_DATA | EC_UPDATE_META); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + ec_fop_prepare_answer(fop, _gf_false); + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.unlink != NULL) { + QUORUM_CBK(fop->cbks.unlink, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.unlink != NULL) { + fop->cbks.unlink(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL, NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_unlink(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_unlink_cbk_t func, void *data, loc_t *loc, + int xflags, dict_t *xdata) +{ + ec_cbk_t callback = {.unlink = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(UNLINK) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, 
out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_UNLINK, 0, target, fop_flags, + ec_wind_unlink, ec_manager_unlink, callback, + data); + if (fop == NULL) { + goto out; + } + + fop->int32 = xflags; + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_copy_with_ref(xdata, NULL); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL); + } +} diff --git a/xlators/cluster/ec/src/ec-fops.h b/xlators/cluster/ec/src/ec-fops.h new file mode 100644 index 00000000000..07edf8a7fec --- /dev/null +++ b/xlators/cluster/ec/src/ec-fops.h @@ -0,0 +1,254 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef __EC_FOPS_H__ +#define __EC_FOPS_H__ + +#include <glusterfs/xlator.h> + +#include "ec-types.h" +#include "ec-common.h" + +void +ec_access(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_access_cbk_t func, void *data, loc_t *loc, + int32_t mask, dict_t *xdata); + +void +ec_create(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_create_cbk_t func, void *data, loc_t *loc, + int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata); + +void +ec_entrylk(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_entrylk_cbk_t func, void *data, + const char *volume, loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata); + +void +ec_fentrylk(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fentrylk_cbk_t func, void *data, + const char *volume, fd_t *fd, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata); + +void +ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_flush_cbk_t func, void *data, fd_t *fd, + dict_t *xdata); + +void +ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fsync_cbk_t func, void *data, fd_t *fd, + int32_t datasync, dict_t *xdata); + +void +ec_fsyncdir(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fsyncdir_cbk_t func, void *data, fd_t *fd, + int32_t datasync, dict_t *xdata); + +void +ec_getxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_getxattr_cbk_t func, void *data, loc_t *loc, + const char *name, dict_t *xdata); + +void +ec_fgetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fgetxattr_cbk_t func, void *data, fd_t *fd, + const char *name, dict_t *xdata); + +void +ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_heal_cbk_t 
func, void *data, loc_t *loc, + int32_t partial, dict_t *xdata); + +void +ec_fheal(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fheal_cbk_t func, void *data, fd_t *fd, + int32_t partial, dict_t *xdata); + +void +ec_inodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner, + uintptr_t target, uint32_t fop_flags, fop_inodelk_cbk_t func, + void *data, const char *volume, loc_t *loc, int32_t cmd, + struct gf_flock *flock, dict_t *xdata); + +void +ec_finodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner, + uintptr_t target, uint32_t fop_flags, fop_finodelk_cbk_t func, + void *data, const char *volume, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata); + +void +ec_link(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_link_cbk_t func, void *data, loc_t *oldloc, + loc_t *newloc, dict_t *xdata); + +void +ec_lk(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags, + fop_lk_cbk_t func, void *data, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata); + +void +ec_lookup(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_lookup_cbk_t func, void *data, loc_t *loc, + dict_t *xdata); + +void +ec_mkdir(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_mkdir_cbk_t func, void *data, loc_t *loc, + mode_t mode, mode_t umask, dict_t *xdata); + +void +ec_mknod(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_mknod_cbk_t func, void *data, loc_t *loc, + mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata); + +void +ec_open(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_open_cbk_t func, void *data, loc_t *loc, + int32_t flags, fd_t *fd, dict_t *xdata); + +void +ec_opendir(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_opendir_cbk_t func, void *data, loc_t *loc, + fd_t *fd, dict_t *xdata); + +void 
+ec_readdir(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_readdir_cbk_t func, void *data, fd_t *fd, + size_t size, off_t offset, dict_t *xdata); + +void +ec_readdirp(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_readdirp_cbk_t func, void *data, fd_t *fd, + size_t size, off_t offset, dict_t *xdata); + +void +ec_readlink(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_readlink_cbk_t func, void *data, loc_t *loc, + size_t size, dict_t *xdata); + +void +ec_readv(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_readv_cbk_t func, void *data, fd_t *fd, + size_t size, off_t offset, uint32_t flags, dict_t *xdata); + +void +ec_removexattr(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_removexattr_cbk_t func, void *data, + loc_t *loc, const char *name, dict_t *xdata); + +void +ec_fremovexattr(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fremovexattr_cbk_t func, void *data, + fd_t *fd, const char *name, dict_t *xdata); + +void +ec_rename(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_rename_cbk_t func, void *data, loc_t *oldloc, + loc_t *newloc, dict_t *xdata); + +void +ec_rmdir(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_rmdir_cbk_t func, void *data, loc_t *loc, + int xflags, dict_t *xdata); + +void +ec_setattr(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_setattr_cbk_t func, void *data, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata); + +void +ec_fsetattr(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fsetattr_cbk_t func, void *data, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata); + +void +ec_setxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_setxattr_cbk_t func, void 
*data, loc_t *loc, + dict_t *dict, int32_t flags, dict_t *xdata); + +void +ec_fsetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fsetxattr_cbk_t func, void *data, fd_t *fd, + dict_t *dict, int32_t flags, dict_t *xdata); + +void +ec_stat(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_stat_cbk_t func, void *data, loc_t *loc, + dict_t *xdata); + +void +ec_fstat(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fstat_cbk_t func, void *data, fd_t *fd, + dict_t *xdata); + +void +ec_statfs(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_statfs_cbk_t func, void *data, loc_t *loc, + dict_t *xdata); + +void +ec_symlink(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_symlink_cbk_t func, void *data, + const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata); + +void +ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fallocate_cbk_t func, void *data, fd_t *fd, + int32_t mode, off_t offset, size_t len, dict_t *xdata); + +void +ec_discard(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_discard_cbk_t func, void *data, fd_t *fd, + off_t offset, size_t len, dict_t *xdata); + +void +ec_truncate(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_truncate_cbk_t func, void *data, loc_t *loc, + off_t offset, dict_t *xdata); + +void +ec_ftruncate(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_ftruncate_cbk_t func, void *data, fd_t *fd, + off_t offset, dict_t *xdata); + +void +ec_unlink(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_unlink_cbk_t func, void *data, loc_t *loc, + int xflags, dict_t *xdata); + +void +ec_writev(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_writev_cbk_t func, void *data, fd_t *fd, + 
struct iovec *vector, int32_t count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata); + +void +ec_xattrop(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_xattrop_cbk_t func, void *data, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata); + +void +ec_fxattrop(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fxattrop_cbk_t func, void *data, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata); + +void +ec_seek(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_seek_cbk_t func, void *data, fd_t *fd, + off_t offset, gf_seek_what_t what, dict_t *xdata); + +void +ec_ipc(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_ipc_cbk_t func, void *data, int32_t op, + dict_t *xdata); + +#endif /* __EC_FOPS_H__ */ diff --git a/xlators/cluster/ec/src/ec-galois.c b/xlators/cluster/ec/src/ec-galois.c new file mode 100644 index 00000000000..6e4990c71f5 --- /dev/null +++ b/xlators/cluster/ec/src/ec-galois.c @@ -0,0 +1,183 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <string.h> + +#include "ec-mem-types.h" +#include "ec-gf8.h" +#include "ec-helpers.h" + +/* Allocates an ec_gf_t for a field of (1 << bits) elements together with its log and pow tables (2 * size - 1 entries each). Returns EC_ERR(ENOMEM) if any allocation fails, releasing what was already allocated. */ static ec_gf_t * +ec_gf_alloc(uint32_t bits, uint32_t mod) +{ + ec_gf_t *gf; + + gf = GF_MALLOC(sizeof(ec_gf_t), ec_mt_ec_gf_t); + if (gf == NULL) { + goto failed; + } + + gf->bits = bits; + gf->size = 1 << bits; + gf->mod = mod; + + gf->log = GF_MALLOC(sizeof(uint32_t) * (gf->size * 2 - 1), + gf_common_mt_int); + if (gf->log == NULL) { + goto failed_gf; + } + gf->pow = GF_MALLOC(sizeof(uint32_t) * (gf->size * 2 - 1), + gf_common_mt_int); + if (gf->pow == NULL) { + goto failed_log; + } + + return gf; + +failed_log: + GF_FREE(gf->log); +failed_gf: + GF_FREE(gf); +failed: + return EC_ERR(ENOMEM); +} + +/* Fills the pow and log tables: pow[i] is pow[i - 1] doubled and XORed with gf->mod once it reaches gf->size. Every entry is stored twice, at its index and at index + size - 1. */ static void +ec_gf_init_tables(ec_gf_t *gf) +{ + uint32_t i, tmp; + + memset(gf->log, -1, sizeof(uint32_t) * gf->size); + + gf->pow[0] = 1; + gf->log[0] = gf->size; + gf->log[1] = 0; + for (i = 1; i < gf->size; i++) { + tmp = gf->pow[i - 1] << 1; + if (tmp >= gf->size) { + tmp ^= gf->mod; + } + gf->pow[i + gf->size - 1] = gf->pow[i] = tmp; + gf->log[tmp + gf->size - 1] = gf->log[tmp] = i; + } +} + +/* Builds a field descriptor. Only bits == 8 is accepted (EC_ERR(EINVAL) otherwise); mod == 0 selects the default 0x11d. Also records the minimum, maximum and average number of operations listed in the ec_gf8_mul tables. */ ec_gf_t * +ec_gf_prepare(uint32_t bits, uint32_t mod) +{ + ec_gf_mul_t **tbl; + ec_gf_t *gf; + uint32_t i, j; + + if (bits != 8) { + return EC_ERR(EINVAL); + } + + tbl = ec_gf8_mul; + if (mod == 0) { + mod = 0x11d; + } + + gf = ec_gf_alloc(bits, mod); + if (EC_IS_ERR(gf)) { + return gf; + } + ec_gf_init_tables(gf); + + gf->table = tbl; + gf->min_ops = bits * bits; + gf->max_ops = 0; + gf->avg_ops = 0; + for (i = 1; i < gf->size; i++) { + for (j = 0; tbl[i]->ops[j].op != EC_GF_OP_END; j++) { + } + if (gf->max_ops < j) { + gf->max_ops = j; + } + if (gf->min_ops > j) { + gf->min_ops = j; + } + gf->avg_ops += j; + } + /* NOTE(review): the sum above covers size - 1 tables (i starts at 1) but is divided by size; confirm this is intended. */ gf->avg_ops /= gf->size; + + return gf; +} + +/* Frees both tables and the descriptor. gf is dereferenced unconditionally, so it must not be NULL. */ void +ec_gf_destroy(ec_gf_t *gf) +{ + GF_FREE(gf->pow); + GF_FREE(gf->log); + GF_FREE(gf); +} + +/* Field addition (XOR of the operands); gf->size is returned when an operand is not smaller than gf->size. */ uint32_t +ec_gf_add(ec_gf_t *gf, uint32_t a, uint32_t b) +{ + if ((a >= gf->size) || (b >= gf->size)) { + return
gf->size; + } + + return a ^ b; +} + +/* Field multiplication through the tables: pow[log[a] + log[b]]. Returns 0 when either operand is 0 and gf->size when an operand is out of range. */ uint32_t +ec_gf_mul(ec_gf_t *gf, uint32_t a, uint32_t b) +{ + if ((a >= gf->size) || (b >= gf->size)) { + return gf->size; + } + + if ((a != 0) && (b != 0)) { + return gf->pow[gf->log[a] + gf->log[b]]; + } + + return 0; +} + +/* Field division through the tables. Returns gf->size for out-of-range operands and for b == 0, and 0 when a == 0. */ uint32_t +ec_gf_div(ec_gf_t *gf, uint32_t a, uint32_t b) +{ + if ((a >= gf->size) || (b >= gf->size)) { + return gf->size; + } + + if (b != 0) { + if (a != 0) { + /* The size - 1 offset keeps the index non-negative when log[b] > log[a]. */ return gf->pow[gf->size - 1 + gf->log[a] - gf->log[b]]; + } + + return 0; + } + + return gf->size; +} + +/* Raises a to the power b by repeated squaring, using ec_gf_mul(). Returns gf->size when a is out of range or when both a and b are 0. */ uint32_t +ec_gf_exp(ec_gf_t *gf, uint32_t a, uint32_t b) +{ + uint32_t r; + + if ((a >= gf->size) || ((a == 0) && (b == 0))) { + return gf->size; + } + + r = 1; + while (b != 0) { + if ((b & 1) != 0) { + r = ec_gf_mul(gf, r, a); + } + a = ec_gf_mul(gf, a, a); + b >>= 1; + } + + return r; +} diff --git a/xlators/cluster/ec/src/ec-galois.h b/xlators/cluster/ec/src/ec-galois.h new file mode 100644 index 00000000000..ed55d53e419 --- /dev/null +++ b/xlators/cluster/ec/src/ec-galois.h @@ -0,0 +1,32 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation.
+*/ + +#ifndef __EC_GALOIS_H__ +#define __EC_GALOIS_H__ + +#include <inttypes.h> + +#include "ec-types.h" + +ec_gf_t * +ec_gf_prepare(uint32_t bits, uint32_t mod); +void +ec_gf_destroy(ec_gf_t *gf); + +uint32_t +ec_gf_add(ec_gf_t *gf, uint32_t a, uint32_t b); +uint32_t +ec_gf_mul(ec_gf_t *gf, uint32_t a, uint32_t b); +uint32_t +ec_gf_div(ec_gf_t *gf, uint32_t a, uint32_t b); +uint32_t +ec_gf_exp(ec_gf_t *gf, uint32_t a, uint32_t b); + +#endif /* __EC_GALOIS_H__ */ diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c new file mode 100644 index 00000000000..884deb93669 --- /dev/null +++ b/xlators/cluster/ec/src/ec-generic.c @@ -0,0 +1,1591 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <glusterfs/byte-order.h> + +#include "ec.h" +#include "ec-messages.h" +#include "ec-helpers.h" +#include "ec-common.h" +#include "ec-combine.h" +#include "ec-fops.h" + +/* FOP: flush */ + +/* Callback for the flush wound to one child; the cookie carries the child index. Stores the result in a new cbk (taking a reference on xdata when present) and passes it to ec_combine(). ec_complete() is called whenever fop could be read from frame->local. Always returns 0. */ int32_t +ec_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FLUSH, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + ec_combine(cbk, NULL); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +/* Winds the flush to child idx of ec->xl_list, passing idx as the cookie so ec_flush_cbk() can identify the answering child. */ void +ec_wind_flush(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_flush_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->flush, fop->fd, + fop->xdata); +} + +/* State handler for flush: performs the step for the given state and returns the next state. Negative states are the error variants. */ int32_t +ec_manager_flush(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_LOCK: + ec_lock_prepare_fd(fop, fop->fd, 0, 0, EC_RANGE_FULL); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_flush_size_version(fop); + + return EC_STATE_DELAYED_START; + + case EC_STATE_DELAYED_START: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + ec_fop_prepare_answer(fop, _gf_false); + + return EC_STATE_REPORT; + + case
EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.flush != NULL) { + fop->cbks.flush(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DELAYED_START: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + /* Error variants: report -1 with fop->error and no xdata. */ if (fop->cbks.flush != NULL) { + fop->cbks.flush(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +/* Returns EBADF when the bad_version stored in the fd context is lower than the bad_version stored in the inode context, 0 otherwise. A missing context counts as version 0. Each context is read under its own lock (fd->lock, then fd->inode->lock); fd and fd->inode are dereferenced unconditionally. */ static int32_t +ec_validate_fd(fd_t *fd, xlator_t *xl) +{ + uint64_t iversion = 0; + uint64_t fversion = 0; + ec_inode_t *inode_ctx = NULL; + ec_fd_t *fd_ctx = NULL; + + LOCK(&fd->lock); + { + fd_ctx = __ec_fd_get(fd, xl); + if (fd_ctx) { + fversion = fd_ctx->bad_version; + } + } + UNLOCK(&fd->lock); + + LOCK(&fd->inode->lock); + { + inode_ctx = __ec_inode_get(fd->inode, xl); + if (inode_ctx) { + iversion = inode_ctx->bad_version; + } + } + UNLOCK(&fd->inode->lock); + if (fversion < iversion) { + return EBADF; + } + return 0; +} + +/* Entry point of the flush fop: validates the fd (when given) before any fop state is allocated. */ void +ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_flush_cbk_t func, void *data, fd_t *fd, + dict_t *xdata) +{ + ec_cbk_t callback = {.flush = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(FLUSH) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + if (fd) { + error = ec_validate_fd(fd, this); + if (error) { +
gf_msg(this->name, GF_LOG_ERROR, EBADF, EC_MSG_FD_BAD, + "Failing %s on %s", gf_fop_list[GF_FOP_FLUSH], + fd->inode ? uuid_utoa(fd->inode->gfid) : ""); + goto out; + } + } + + fop = ec_fop_data_allocate(frame, this, GF_FOP_FLUSH, 0, target, fop_flags, + ec_wind_flush, ec_manager_flush, callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL); + } +} + +/* FOP: fsync */ + +int32_t +ec_combine_fsync(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src) +{ + if (!ec_iatt_combine(fop, dst->iatt, src->iatt, 2)) { + gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_IATT_MISMATCH, + "Mismatching iatt in " + "answers of 'GF_FOP_FSYNC'"); + + return 0; + } + + return 1; +} + +int32_t +ec_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FSYNC, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (op_ret >= 0) { + if (prebuf != NULL) { + cbk->iatt[0] = 
*prebuf; + } + if (postbuf != NULL) { + cbk->iatt[1] = *postbuf; + } + } + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + ec_combine(cbk, ec_combine_fsync); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +/* Winds the fsync to child idx of ec->xl_list with the fd, the datasync flag kept in fop->int32 and the request xdata; idx travels as the cookie. */ void +ec_wind_fsync(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_fsync_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->fsync, fop->fd, + fop->int32, fop->xdata); +} + +/* State handler for fsync: performs the step for the given state and returns the next state. Unlike flush, the lock is prepared with EC_QUERY_INFO. */ int32_t +ec_manager_fsync(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_LOCK: + ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0, EC_RANGE_FULL); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_flush_size_version(fop); + + return EC_STATE_DELAYED_START; + + case EC_STATE_DELAYED_START: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_false); + if (cbk != NULL) { + ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count); + + /* This shouldn't fail because we have the inode locked.
*/ + /* NOTE(review): ec_get_inode_size() fills iatt[0].ia_size, so this call has a side effect inside GF_ASSERT; confirm the macro evaluates its argument in every build configuration. */ GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, + &cbk->iatt[0].ia_size)); + cbk->iatt[1].ia_size = cbk->iatt[0].ia_size; + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.fsync != NULL) { + fop->cbks.fsync(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], + cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + case -EC_STATE_DELAYED_START: + GF_ASSERT(fop->error != 0); + + /* Error variants: report -1 with fop->error and no iatt/xdata. */ if (fop->cbks.fsync != NULL) { + fop->cbks.fsync(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL, NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +/* Entry point of the fsync fop: validates the fd (when given) before any fop state is allocated. */ void +ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fsync_cbk_t func, void *data, fd_t *fd, + int32_t datasync, dict_t *xdata) +{ + ec_cbk_t callback = {.fsync = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(FSYNC) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + if (fd) { + error = ec_validate_fd(fd, this); + if (error) { + gf_msg(this->name, GF_LOG_ERROR, EBADF, EC_MSG_FD_BAD, + "Failing %s on %s", gf_fop_list[GF_FOP_FSYNC], + fd->inode ?
uuid_utoa(fd->inode->gfid) : ""); + goto out; + } + } + + fop = ec_fop_data_allocate(frame, this, GF_FOP_FSYNC, 0, target, fop_flags, + ec_wind_fsync, ec_manager_fsync, callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + fop->int32 = datasync; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL); + } +} + +/* FOP: fsyncdir */ + +int32_t +ec_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FSYNCDIR, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + ec_combine(cbk, NULL); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_fsyncdir(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_fsyncdir_cbk, (void 
*)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->fsyncdir, + fop->fd, fop->int32, fop->xdata); +} + +/* State handler for fsyncdir: performs the step for the given state and returns the next state. Negative states are the error variants. */ int32_t +ec_manager_fsyncdir(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_LOCK: + ec_lock_prepare_fd(fop, fop->fd, 0, 0, EC_RANGE_FULL); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_flush_size_version(fop); + + return EC_STATE_DELAYED_START; + + case EC_STATE_DELAYED_START: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + ec_fop_prepare_answer(fop, _gf_false); + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.fsyncdir != NULL) { + fop->cbks.fsyncdir(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + case -EC_STATE_DELAYED_START: + GF_ASSERT(fop->error != 0); + + /* Error variants: report -1 with fop->error and no xdata. */ if (fop->cbks.fsyncdir != NULL) { + fop->cbks.fsyncdir(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +/* Entry point of the fsyncdir fop. error starts as ENOMEM and is only cleared once every reference has been taken. */ void +ec_fsyncdir(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fsyncdir_cbk_t func, void *data, fd_t *fd, + int32_t datasync, dict_t *xdata) +{ + ec_cbk_t callback = {.fsyncdir = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(FSYNCDIR) %p", frame); + + VALIDATE_OR_GOTO(this, out); +
GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_FSYNCDIR, 0, target, + fop_flags, ec_wind_fsyncdir, ec_manager_fsyncdir, + callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + fop->int32 = datasync; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL); + } +} + +/* FOP: lookup */ + +/* Post-processes a lookup answer; does nothing when cbk->op_ret < 0. Calls ec_dict_del_array() for EC_XATTR_VERSION with cbk->version, updates fop->loc[0] through ec_loc_update(), and overrides cbk->version / the file size with post_version / post_size from the inode context when the context flags them as available. For regular files the original ia_size is saved in cbk->size first. */ void +ec_lookup_rebuild(ec_t *ec, ec_fop_data_t *fop, ec_cbk_data_t *cbk) +{ + ec_inode_t *ctx = NULL; + uint64_t size = 0; + int32_t have_size = 0, err; + + if (cbk->op_ret < 0) { + return; + } + + ec_dict_del_array(cbk->xdata, EC_XATTR_VERSION, cbk->version, + EC_VERSION_SIZE); + + err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode, &cbk->iatt[0]); + if (ec_cbk_set_error(cbk, -err, _gf_true)) { + return; + } + + /* The inode context is only read while holding the inode lock. */ LOCK(&cbk->inode->lock); + + ctx = __ec_inode_get(cbk->inode, fop->xl); + if (ctx != NULL) { + if (ctx->have_version) { + cbk->version[0] = ctx->post_version[0]; + cbk->version[1] = ctx->post_version[1]; + } + if (ctx->have_size) { + size = ctx->post_size; + have_size = 1; + } + } + + UNLOCK(&cbk->inode->lock); + + if (cbk->iatt[0].ia_type == IA_IFREG) { + cbk->size = cbk->iatt[0].ia_size; + ec_dict_del_number(cbk->xdata, EC_XATTR_SIZE, &cbk->iatt[0].ia_size); + if (have_size) { + cbk->iatt[0].ia_size = size; + } + } +} + +/* Combine callback for lookup: two answers match only when ec_iatt_combine() accepts both iatt entries. Returns 1 on match, 0 (after a debug log) otherwise. */ int32_t +ec_combine_lookup(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src) +{ + if (!ec_iatt_combine(fop, dst->iatt, src->iatt, 2)) { +
gf_msg(fop->xl->name, GF_LOG_DEBUG, 0, EC_MSG_IATT_MISMATCH, + "Mismatching iatt in " + "answers of 'GF_FOP_LOOKUP'"); + + return 0; + } + + return 1; +} + +/* Callback for the lookup wound to one child; the cookie carries the child index. On success it references the inode and copies buf/postparent into cbk->iatt[0]/[1]. When xdata is present it is referenced and ec_dict_del_array() is called on it for EC_XATTR_DIRTY; the values read into the local dirty array are not used afterwards. Always returns 0. */ int32_t +ec_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + uint64_t dirty[2] = {0}; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_LOOKUP, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (op_ret >= 0) { + if (inode != NULL) { + cbk->inode = inode_ref(inode); + if (cbk->inode == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_INODE_REF_FAIL, + "Failed to reference an inode."); + + goto out; + } + } + if (buf != NULL) { + cbk->iatt[0] = *buf; + } + if (postparent != NULL) { + cbk->iatt[1] = *postparent; + } + } + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + ec_dict_del_array(xdata, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE); + } + + ec_combine(cbk, ec_combine_lookup); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +/* Winds the lookup to child idx of ec->xl_list using fop->loc[0] and the request xdata; idx travels as the cookie. */ void +ec_wind_lookup(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_lookup_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->lookup, + &fop->loc[0], fop->xdata); +} + +/* State handler for lookup: performs the step for the given state and returns the next state. No lock states are used. */ int32_t +ec_manager_lookup(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + int32_t
err; + + switch (state) { + case EC_STATE_INIT: + /* Make sure a request dictionary exists; a caller-supplied one has GF_CONTENT_KEY removed. */ if (fop->xdata == NULL) { + fop->xdata = dict_new(); + if (fop->xdata == NULL) { + gf_msg(fop->xl->name, GF_LOG_ERROR, ENOMEM, + EC_MSG_LOOKUP_REQ_PREP_FAIL, + "Unable to prepare " + "lookup request"); + + fop->error = ENOMEM; + + return EC_STATE_REPORT; + } + } else { + /*TODO: To be handled once we have 'syndromes' */ + dict_del(fop->xdata, GF_CONTENT_KEY); + } + /* Add the EC size, version and dirty keys (value 0) to the request; the first failure stops the chain. */ err = dict_set_uint64(fop->xdata, EC_XATTR_SIZE, 0); + if (err == 0) { + err = dict_set_uint64(fop->xdata, EC_XATTR_VERSION, 0); + } + if (err == 0) { + err = dict_set_uint64(fop->xdata, EC_XATTR_DIRTY, 0); + } + if (err != 0) { + gf_msg(fop->xl->name, GF_LOG_ERROR, -err, + EC_MSG_LOOKUP_REQ_PREP_FAIL, + "Unable to prepare lookup " + "request"); + + fop->error = -err; + + return EC_STATE_REPORT; + } + + /* Fall through */ + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + /* + * Lookup happens without any lock, so there is a chance that it + * will have answers before modification happened and after + * modification happened in the same response.
So choose the next + * best answer when the answers don't match for EC_MINIMUM_MIN + */ + + if (!fop->answer && !list_empty(&fop->cbk_list)) { + fop->answer = list_entry(fop->cbk_list.next, ec_cbk_data_t, + list); + } + + cbk = ec_fop_prepare_answer(fop, _gf_true); + if (cbk != NULL) { + ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count); + + ec_lookup_rebuild(fop->xl->private, fop, cbk); + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.lookup != NULL) { + fop->cbks.lookup(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, cbk->inode, &cbk->iatt[0], + cbk->xdata, &cbk->iatt[1]); + } + + return EC_STATE_END; + + case -EC_STATE_INIT: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + /* Error variants: report -1 with fop->error and no inode, iatt or xdata. */ if (fop->cbks.lookup != NULL) { + fop->cbks.lookup(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL, NULL, NULL); + } + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +/* Entry point of the lookup fop: allocates the fop with EC_FLAG_LOCK_SHARED, copies loc and hands the fop to ec_manager(). When no fop could be allocated the callback is invoked directly with -1 and the error. */ void +ec_lookup(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_lookup_cbk_t func, void *data, loc_t *loc, + dict_t *xdata) +{ + ec_cbk_t callback = {.lookup = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(LOOKUP) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_LOOKUP, EC_FLAG_LOCK_SHARED, + target, fop_flags, ec_wind_lookup, + ec_manager_lookup, callback, data); + if (fop == NULL) { + goto out; + } + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + }
+ } + if (xdata != NULL) { + /* Unlike the other fops in this file, lookup stores the result of dict_copy_with_ref() rather than dict_ref(). */ fop->xdata = dict_copy_with_ref(xdata, NULL); + /* Do not log failures here as a memory problem would have already + * been logged by the corresponding alloc functions */ + if (fop->xdata == NULL) + goto out; + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL); + } +} + +/* FOP: statfs */ + +/* Combine callback for statfs: merges src->statvfs into dst->statvfs with ec_statvfs_combine() and always reports the answers as matching (returns 1). */ int32_t +ec_combine_statfs(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src) +{ + ec_statvfs_combine(&dst->statvfs, &src->statvfs); + + return 1; +} + +/* Callback for the statfs wound to one child; the cookie carries the child index. On success buf is copied into cbk->statvfs before the answer is combined. Always returns 0. */ int32_t +ec_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct statvfs *buf, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_STATFS, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (op_ret >= 0) { + if (buf != NULL) { + cbk->statvfs = *buf; + } + } + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + ec_combine(cbk, ec_combine_statfs); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +/* Winds the statfs to child idx of ec->xl_list using fop->loc[0] and the request xdata; idx travels as the cookie. */ void +ec_wind_statfs(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_statfs_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->statfs, + &fop->loc[0], fop->xdata); +} + +int32_t +ec_manager_statfs(ec_fop_data_t *fop, int32_t state) +{ +
ec_cbk_data_t *cbk = NULL; + gf_boolean_t deem_statfs_enabled = _gf_false; + int32_t err = 0; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_true); + if (cbk != NULL) { + ec_t *ec = fop->xl->private; + + if (cbk->xdata) { + err = dict_get_int8(cbk->xdata, "quota-deem-statfs", + (int8_t *)&deem_statfs_enabled); + if (err != -ENOENT) { + ec_cbk_set_error(cbk, -err, _gf_true); + } + } + + if (err != 0 || deem_statfs_enabled == _gf_false) { + cbk->statvfs.f_blocks *= ec->fragments; + cbk->statvfs.f_bfree *= ec->fragments; + cbk->statvfs.f_bavail *= ec->fragments; + } + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.statfs != NULL) { + fop->cbks.statfs(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, &cbk->statvfs, cbk->xdata); + } + + return EC_STATE_END; + + case -EC_STATE_INIT: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.statfs != NULL) { + fop->cbks.statfs(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL); + } + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_statfs(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_statfs_cbk_t func, void *data, loc_t *loc, + dict_t *xdata) +{ + ec_cbk_t callback = {.statfs = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(STATFS) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_STATFS, EC_FLAG_LOCK_SHARED, + target, 
fop_flags, ec_wind_statfs, + ec_manager_statfs, callback, data); + if (fop == NULL) { + goto out; + } + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL); + } +} + +/* FOP: xattrop */ + +/* Combine callback for xattrop: two answers match only when ec_dict_compare() accepts their dictionaries. Returns 1 on match, 0 (after a debug log) otherwise. */ int32_t +ec_combine_xattrop(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src) +{ + if (!ec_dict_compare(dst->dict, src->dict)) { + gf_msg(fop->xl->name, GF_LOG_DEBUG, 0, EC_MSG_DICT_MISMATCH, + "Mismatching dictionary in " + "answers of 'GF_FOP_XATTROP'"); + + return 0; + } + + return 1; +} + +/* Callback for the xattrop/fxattrop wound to one child; the cookie carries the child index and the cbk is allocated with fop->id. On success the returned xattr is referenced and its EC_XATTR_VERSION value is inspected for the EC_SELFHEAL_BIT flag. */ int32_t +ec_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_lock_link_t *link = NULL; + ec_cbk_data_t *cbk = NULL; + uint64_t dirty[2] = {0}; + data_t *data; + uint64_t *version; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, fop->id, idx, op_ret, + op_errno); + if (!cbk) + goto out; + + if (op_ret >= 0) { + cbk->dict = dict_ref(xattr); + + data = dict_get(cbk->dict, EC_XATTR_VERSION); + if ((data != NULL) && (data->len >= sizeof(uint64_t))) { + version = (uint64_t *)data->data; + + /* The stored value is converted with ntoh64() before the bit test. */ if (((ntoh64(version[0]) >> EC_SELFHEAL_BIT) & 1) != 0) { +
LOCK(&fop->lock); + + fop->healing |= 1ULL << idx; + + UNLOCK(&fop->lock); + } + } + + ec_dict_del_array(xattr, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE); + link = fop->data; + if (link) { + /*Keep a note of if the dirty is already set or not*/ + link->dirty[0] |= (dirty[0] != 0); + link->dirty[1] |= (dirty[1] != 0); + } + } + + if (xdata) + cbk->xdata = dict_ref(xdata); + + ec_combine(cbk, ec_combine_xattrop); + +out: + if (fop) + ec_complete(fop); + + return 0; +} + +void +ec_wind_xattrop(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_xattrop_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->xattrop, + &fop->loc[0], fop->xattrop_flags, fop->dict, fop->xdata); +} + +int32_t +ec_manager_xattrop(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_LOCK: + if (fop->fd == NULL) { + ec_lock_prepare_inode(fop, &fop->loc[0], EC_UPDATE_META, 0, + EC_RANGE_FULL); + } else { + ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META, 0, + EC_RANGE_FULL); + } + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_false); + if (cbk != NULL) { + int32_t err; + + err = ec_dict_combine(cbk, EC_COMBINE_DICT); + ec_cbk_set_error(cbk, -err, _gf_false); + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->id == GF_FOP_XATTROP) { + if (fop->cbks.xattrop != NULL) { + fop->cbks.xattrop(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, cbk->dict, cbk->xdata); + } + } else { + if (fop->cbks.fxattrop != NULL) { + fop->cbks.fxattrop(fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, cbk->dict, + cbk->xdata); + } + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case 
-EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->id == GF_FOP_XATTROP) { + if (fop->cbks.xattrop != NULL) { + fop->cbks.xattrop(fop->req_frame, fop, fop->xl, -1, + fop->error, NULL, NULL); + } + } else { + if (fop->cbks.fxattrop != NULL) { + fop->cbks.fxattrop(fop->req_frame, fop, fop->xl, -1, + fop->error, NULL, NULL); + } + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_xattrop(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_xattrop_cbk_t func, void *data, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + ec_cbk_t callback = {.xattrop = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(XATTROP) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_XATTROP, 0, target, + fop_flags, ec_wind_xattrop, ec_manager_xattrop, + callback, data); + if (fop == NULL) { + goto out; + } + + fop->xattrop_flags = optype; + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (xattr != NULL) { + fop->dict = dict_ref(xattr); + if (fop->dict == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + 
gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL); + } +} + +void +ec_wind_fxattrop(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_xattrop_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->fxattrop, + fop->fd, fop->xattrop_flags, fop->dict, fop->xdata); +} + +void +ec_fxattrop(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fxattrop_cbk_t func, void *data, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + ec_cbk_t callback = {.fxattrop = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(FXATTROP) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_FXATTROP, 0, target, + fop_flags, ec_wind_fxattrop, ec_manager_xattrop, + callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + fop->xattrop_flags = optype; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (xattr != NULL) { + fop->dict = dict_ref(xattr); + if (fop->dict == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + 
func(frame, NULL, this, -1, error, NULL, NULL); + } +} + +/* FOP: IPC */ + +int32_t +ec_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_IPC, idx, op_ret, + op_errno); + + if (cbk != NULL) { + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + } + + ec_combine(cbk, NULL); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_ipc(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_ipc_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->ipc, fop->int32, + fop->xdata); +} + +int32_t +ec_manager_ipc(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + ec_fop_prepare_answer(fop, _gf_true); + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + if (fop->cbks.ipc != NULL) { + fop->cbks.ipc(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, cbk->xdata); + } + + return EC_STATE_END; + + case -EC_STATE_INIT: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.ipc != NULL) { + fop->cbks.ipc(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL); + } + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, 
GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_ipc(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_ipc_cbk_t func, void *data, int32_t op, + dict_t *xdata) +{ + ec_cbk_t callback = {.ipc = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(IPC) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_IPC, 0, target, fop_flags, + ec_wind_ipc, ec_manager_ipc, callback, data); + if (fop == NULL) { + goto out; + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + } + fop->int32 = op; + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL); + } +} diff --git a/xlators/cluster/ec/src/ec-gf8.c b/xlators/cluster/ec/src/ec-gf8.c new file mode 100644 index 00000000000..039adae5929 --- /dev/null +++ b/xlators/cluster/ec/src/ec-gf8.c @@ -0,0 +1,5882 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include "ec-gf8.h" + +static ec_gf_op_t ec_gf8_mul_00_ops[] = {{EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_00 = {0, + { + 0, + }, + ec_gf8_mul_00_ops}; + +static ec_gf_op_t ec_gf8_mul_01_ops[] = {{EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_01 = {8, + { + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + }, + ec_gf8_mul_01_ops}; + +static ec_gf_op_t ec_gf8_mul_02_ops[] = {{EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_02 = {8, + { + 7, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + }, + ec_gf8_mul_02_ops}; + +static ec_gf_op_t ec_gf8_mul_03_ops[] = { + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_COPY, 8, 3, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_03 = {9, + { + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + }, + ec_gf8_mul_03_ops}; + +static ec_gf_op_t ec_gf8_mul_04_ops[] = { + {EC_GF_OP_XOR3, 8, 6, 7}, {EC_GF_OP_XOR2, 2, 8, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 1, 8, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_04 = {9, + { + 6, + 7, + 0, + 1, + 2, + 3, + 4, + 5, + 8, + }, + ec_gf8_mul_04_ops}; + +static ec_gf_op_t ec_gf8_mul_05_ops[] = { + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_05 = {8, + { + 0, + 1, + 2, + 6, + 7, + 3, + 4, + 5, + }, + ec_gf8_mul_05_ops}; + +static ec_gf_op_t ec_gf8_mul_06_ops[] = { + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_COPY, 8, 2, 0}, + 
{EC_GF_OP_XOR2, 8, 3, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 1, 0, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_06 = {9, + { + 7, + 0, + 1, + 2, + 8, + 3, + 4, + 5, + 6, + }, + ec_gf8_mul_06_ops}; + +static ec_gf_op_t ec_gf8_mul_07_ops[] = { + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_07 = {8, + { + 6, + 0, + 1, + 3, + 2, + 4, + 5, + 7, + }, + ec_gf8_mul_07_ops}; + +static ec_gf_op_t ec_gf8_mul_08_ops[] = { + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR3, 8, 6, 7}, + {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 2, 8, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_08 = {9, + { + 5, + 6, + 7, + 0, + 1, + 2, + 3, + 4, + 8, + }, + ec_gf8_mul_08_ops}; + +static ec_gf_op_t ec_gf8_mul_09_ops[] = { + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_09 = {8, + { + 0, + 1, + 2, + 3, + 5, + 6, + 7, + 4, + }, + ec_gf8_mul_09_ops}; + +static ec_gf_op_t ec_gf8_mul_0A_ops[] = { + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + 
{EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_0A = {8, + { + 5, + 0, + 1, + 2, + 6, + 7, + 3, + 4, + }, + ec_gf8_mul_0A_ops}; + +static ec_gf_op_t ec_gf8_mul_0B_ops[] = { + {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_COPY, 9, 3, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_COPY, 8, 5, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 1, 0, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR3, 3, 8, 6}, {EC_GF_OP_XOR2, 1, 9, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_0B = {10, + { + 7, + 1, + 5, + 2, + 4, + 3, + 0, + 6, + 8, + 9, + }, + ec_gf8_mul_0B_ops}; + +static ec_gf_op_t ec_gf8_mul_0C_ops[] = { + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_COPY, 8, 1, 0}, + {EC_GF_OP_XOR2, 8, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 1, 0, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_0C = {9, + { + 5, + 7, + 0, + 1, + 8, + 2, + 3, + 4, + 6, + }, + ec_gf8_mul_0C_ops}; + +static ec_gf_op_t ec_gf8_mul_0D_ops[] = { + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_XOR3, 8, 2, 4}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 1, 8, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR3, 2, 8, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t 
ec_gf8_mul_0D = {9, + { + 5, + 6, + 7, + 3, + 1, + 0, + 2, + 4, + 8, + }, + ec_gf8_mul_0D_ops}; + +static ec_gf_op_t ec_gf8_mul_0E_ops[] = { + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_0E = {8, + { + 7, + 0, + 6, + 1, + 3, + 2, + 4, + 5, + }, + ec_gf8_mul_0E_ops}; + +static ec_gf_op_t ec_gf8_mul_0F_ops[] = { + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_0F = {8, + { + 1, + 0, + 5, + 6, + 7, + 2, + 3, + 4, + }, + ec_gf8_mul_0F_ops}; + +static ec_gf_op_t ec_gf8_mul_10_ops[] = { + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_10 = {8, + { + 4, + 5, + 6, + 7, + 0, + 1, + 2, + 3, + }, + ec_gf8_mul_10_ops}; + +static ec_gf_op_t ec_gf8_mul_11_ops[] = { + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, 
{EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_11 = {8, + { + 4, + 1, + 2, + 6, + 0, + 5, + 7, + 3, + }, + ec_gf8_mul_11_ops}; + +static ec_gf_op_t ec_gf8_mul_12_ops[] = { + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_12 = {8, + { + 7, + 0, + 1, + 2, + 3, + 5, + 6, + 4, + }, + ec_gf8_mul_12_ops}; + +static ec_gf_op_t ec_gf8_mul_13_ops[] = { + {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR3, 8, 3, 7}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 6, 8, 0}, + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 0, 8, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_13 = {9, + { + 4, + 5, + 2, + 6, + 0, + 1, + 7, + 3, + 8, + }, + ec_gf8_mul_13_ops}; + +static ec_gf_op_t ec_gf8_mul_14_ops[] = { + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_14 = {8, + { + 6, + 7, + 0, + 1, + 2, + 4, + 5, + 3, + }, + ec_gf8_mul_14_ops}; + +static ec_gf_op_t ec_gf8_mul_15_ops[] = { + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + 
{EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR3, 5, 8, 7}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_15 = {9, + { + 0, + 1, + 2, + 4, + 7, + 6, + 5, + 3, + 8, + }, + ec_gf8_mul_15_ops}; + +static ec_gf_op_t ec_gf8_mul_16_ops[] = { + {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_16 = {8, + { + 6, + 7, + 4, + 1, + 2, + 3, + 5, + 0, + }, + ec_gf8_mul_16_ops}; + +static ec_gf_op_t ec_gf8_mul_17_ops[] = { + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_17 = {8, + { + 5, + 7, + 0, + 1, + 3, + 2, + 4, + 6, + }, + ec_gf8_mul_17_ops}; + +static ec_gf_op_t ec_gf8_mul_18_ops[] = { + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_18 = {9, + { + 4, + 5, + 7, + 6, + 0, + 1, + 2, + 3, + 8, + }, + ec_gf8_mul_18_ops}; + +static ec_gf_op_t ec_gf8_mul_19_ops[] = { + 
{EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_19 = {8, + { + 0, + 5, + 2, + 6, + 7, + 1, + 3, + 4, + }, + ec_gf8_mul_19_ops}; + +static ec_gf_op_t ec_gf8_mul_1A_ops[] = { + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_1A = {8, + { + 7, + 0, + 4, + 5, + 3, + 1, + 2, + 6, + }, + ec_gf8_mul_1A_ops}; + +static ec_gf_op_t ec_gf8_mul_1B_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_1B = {8, + { + 7, + 4, + 5, + 6, + 3, + 1, + 2, + 0, + }, + ec_gf8_mul_1B_ops}; + +static ec_gf_op_t ec_gf8_mul_1C_ops[] = { + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + 
{EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_1C = {8, + { + 5, + 4, + 3, + 0, + 1, + 7, + 2, + 6, + }, + ec_gf8_mul_1C_ops}; + +static ec_gf_op_t ec_gf8_mul_1D_ops[] = { + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR3, 8, 4, 2}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_1D = {9, + { + 0, + 7, + 5, + 8, + 2, + 3, + 4, + 1, + 6, + }, + ec_gf8_mul_1D_ops}; + +static ec_gf_op_t ec_gf8_mul_1E_ops[] = { + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_1E = {8, + { + 4, + 7, + 5, + 1, + 6, + 0, + 2, + 3, + }, + ec_gf8_mul_1E_ops}; + +static ec_gf_op_t ec_gf8_mul_1F_ops[] = { + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR3, 8, 3, 7}, + {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 1, 8, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_1F = {9, + { + 1, + 4, + 5, + 6, + 7, + 0, + 3, + 2, + 8, + }, + ec_gf8_mul_1F_ops}; + +static ec_gf_op_t ec_gf8_mul_20_ops[] = { + {EC_GF_OP_XOR2, 6, 7, 0}, 
{EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_20 = {8, + { + 7, + 4, + 5, + 6, + 3, + 0, + 1, + 2, + }, + ec_gf8_mul_20_ops}; + +static ec_gf_op_t ec_gf8_mul_21_ops[] = { + {EC_GF_OP_COPY, 9, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR3, 8, 7, 5}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR2, 2, 8, 0}, + {EC_GF_OP_XOR2, 4, 9, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_21 = {10, + { + 0, + 1, + 2, + 7, + 5, + 4, + 3, + 6, + 8, + 9, + }, + ec_gf8_mul_21_ops}; + +static ec_gf_op_t ec_gf8_mul_22_ops[] = { + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_22 = {8, + { + 3, + 0, + 5, + 2, + 6, + 4, + 1, + 7, + }, + ec_gf8_mul_22_ops}; + +static ec_gf_op_t ec_gf8_mul_23_ops[] = { + {EC_GF_OP_COPY, 8, 2, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 3, 8, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_23 = {9, + { + 0, + 4, + 3, + 2, + 5, + 6, + 1, + 8, + 7, + }, + ec_gf8_mul_23_ops}; + +static 
ec_gf_op_t ec_gf8_mul_24_ops[] = { + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_24 = {8, + { + 6, + 7, + 0, + 1, + 2, + 4, + 5, + 3, + }, + ec_gf8_mul_24_ops}; + +static ec_gf_op_t ec_gf8_mul_25_ops[] = { + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_25 = {8, + { + 2, + 7, + 0, + 1, + 3, + 4, + 5, + 6, + }, + ec_gf8_mul_25_ops}; + +static ec_gf_op_t ec_gf8_mul_26_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_26 = {8, + { + 3, + 4, + 1, + 2, + 0, + 5, + 6, + 7, + }, + ec_gf8_mul_26_ops}; + +static ec_gf_op_t ec_gf8_mul_27_ops[] = { + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_27 = {8, + { + 3, + 0, + 1, + 2, + 6, + 7, + 4, + 5, + }, + 
ec_gf8_mul_27_ops}; + +static ec_gf_op_t ec_gf8_mul_28_ops[] = { + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_28 = {8, + { + 5, + 6, + 3, + 0, + 1, + 2, + 4, + 7, + }, + ec_gf8_mul_28_ops}; + +static ec_gf_op_t ec_gf8_mul_29_ops[] = { + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_29 = {8, + { + 4, + 6, + 3, + 5, + 7, + 0, + 1, + 2, + }, + ec_gf8_mul_29_ops}; + +static ec_gf_op_t ec_gf8_mul_2A_ops[] = { + {EC_GF_OP_COPY, 8, 1, 0}, {EC_GF_OP_XOR2, 8, 0, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR3, 6, 8, 4}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_2A = {9, + { + 3, + 4, + 7, + 2, + 6, + 5, + 1, + 0, + 8, + }, + ec_gf8_mul_2A_ops}; + +static ec_gf_op_t ec_gf8_mul_2B_ops[] = { + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 5, 2, 
0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_2B = {8, + { + 3, + 4, + 7, + 5, + 6, + 0, + 1, + 2, + }, + ec_gf8_mul_2B_ops}; + +static ec_gf_op_t ec_gf8_mul_2C_ops[] = { + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_2C = {8, + { + 5, + 6, + 7, + 0, + 2, + 3, + 4, + 1, + }, + ec_gf8_mul_2C_ops}; + +static ec_gf_op_t ec_gf8_mul_2D_ops[] = { + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR3, 8, 4, 6}, + {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 7, 8, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_2D = {9, + { + 7, + 0, + 3, + 5, + 1, + 4, + 2, + 6, + 8, + }, + ec_gf8_mul_2D_ops}; + +static ec_gf_op_t ec_gf8_mul_2E_ops[] = { + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_COPY, 8, 4, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 8, 7, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 2, 8, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_2E = {9, + { + 5, + 0, + 7, + 3, + 2, + 6, + 4, + 1, + 8, + }, + ec_gf8_mul_2E_ops}; + +static ec_gf_op_t ec_gf8_mul_2F_ops[] = { + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + 
{EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR3, 8, 7, 6}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 3, 8, 0}, + {EC_GF_OP_XOR2, 2, 8, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_2F = {9, + { + 6, + 3, + 2, + 5, + 7, + 0, + 1, + 4, + 8, + }, + ec_gf8_mul_2F_ops}; + +static ec_gf_op_t ec_gf8_mul_30_ops[] = { + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 8, 1, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR3, 6, 8, 7}, + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_30 = {9, + { + 3, + 4, + 7, + 5, + 0, + 6, + 1, + 2, + 8, + }, + ec_gf8_mul_30_ops}; + +static ec_gf_op_t ec_gf8_mul_31_ops[] = { + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_31 = {8, + { + 7, + 1, + 4, + 5, + 6, + 0, + 2, + 3, + }, + ec_gf8_mul_31_ops}; + +static ec_gf_op_t ec_gf8_mul_32_ops[] = { + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_END, 0, 0, 0}}; + 
+static ec_gf_mul_t ec_gf8_mul_32 = {8, + { + 3, + 4, + 6, + 7, + 5, + 0, + 1, + 2, + }, + ec_gf8_mul_32_ops}; + +static ec_gf_op_t ec_gf8_mul_33_ops[] = { + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_33 = {8, + { + 5, + 4, + 3, + 0, + 2, + 1, + 6, + 7, + }, + ec_gf8_mul_33_ops}; + +static ec_gf_op_t ec_gf8_mul_34_ops[] = { + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_34 = {8, + { + 7, + 5, + 3, + 0, + 2, + 4, + 1, + 6, + }, + ec_gf8_mul_34_ops}; + +static ec_gf_op_t ec_gf8_mul_35_ops[] = { + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_35 = {8, + { + 6, + 7, + 5, + 4, + 2, + 0, + 1, + 3, + }, + ec_gf8_mul_35_ops}; + +static ec_gf_op_t ec_gf8_mul_36_ops[] = { + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 
7, 4, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_36 = {8, + { + 6, + 7, + 4, + 1, + 2, + 3, + 0, + 5, + }, + ec_gf8_mul_36_ops}; + +static ec_gf_op_t ec_gf8_mul_37_ops[] = { + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR3, 8, 0, 1}, + {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR2, 7, 8, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_37 = {9, + { + 6, + 7, + 2, + 1, + 0, + 3, + 4, + 5, + 8, + }, + ec_gf8_mul_37_ops}; + +static ec_gf_op_t ec_gf8_mul_38_ops[] = { + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR3, 8, 6, 7}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 4, 8, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_38 = {9, + { + 4, + 5, + 6, + 3, + 0, + 1, + 7, + 2, + 8, + }, + ec_gf8_mul_38_ops}; + +static ec_gf_op_t ec_gf8_mul_39_ops[] = { + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_39 = {8, + { + 1, + 6, + 3, + 0, + 5, + 2, + 4, + 7, + }, + ec_gf8_mul_39_ops}; + +static 
ec_gf_op_t ec_gf8_mul_3A_ops[] = { + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_3A = {8, + { + 3, + 4, + 7, + 0, + 5, + 6, + 1, + 2, + }, + ec_gf8_mul_3A_ops}; + +static ec_gf_op_t ec_gf8_mul_3B_ops[] = { + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR3, 8, 7, 3}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_3B = {9, + { + 3, + 0, + 1, + 7, + 6, + 2, + 4, + 8, + 5, + }, + ec_gf8_mul_3B_ops}; + +static ec_gf_op_t ec_gf8_mul_3C_ops[] = { + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_3C = {8, + { + 3, + 6, + 4, + 1, + 7, + 2, + 0, + 5, + }, + ec_gf8_mul_3C_ops}; + +static ec_gf_op_t ec_gf8_mul_3D_ops[] = { + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 1, 0, 0}, 
+ {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_3D = {8, + { + 2, + 3, + 4, + 5, + 6, + 7, + 0, + 1, + }, + ec_gf8_mul_3D_ops}; + +static ec_gf_op_t ec_gf8_mul_3E_ops[] = { + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_3E = {8, + { + 6, + 1, + 2, + 7, + 0, + 3, + 5, + 4, + }, + ec_gf8_mul_3E_ops}; + +static ec_gf_op_t ec_gf8_mul_3F_ops[] = { + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_COPY, 10, 4, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_COPY, 9, 2, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR3, 4, 9, 7}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 3, 10, 0}, {EC_GF_OP_XOR2, 5, 8, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_3F = {11, + { + 1, + 7, + 6, + 2, + 4, + 3, + 5, + 0, + 8, + 9, + 10, + }, + ec_gf8_mul_3F_ops}; + +static ec_gf_op_t ec_gf8_mul_40_ops[] = { + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR3, 8, 7, 6}, + {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 5, 8, 0}, + {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_40 = {9, + { + 5, + 7, + 4, + 6, + 2, + 3, + 0, + 1, + 8, + }, + ec_gf8_mul_40_ops}; + +static ec_gf_op_t 
ec_gf8_mul_41_ops[] = { + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 8, 4, 0}, + {EC_GF_OP_XOR2, 8, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_41 = {9, + { + 0, + 7, + 6, + 5, + 3, + 4, + 8, + 1, + 2, + }, + ec_gf8_mul_41_ops}; + +static ec_gf_op_t ec_gf8_mul_42_ops[] = { + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 8, 3, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_42 = {9, + { + 2, + 7, + 1, + 6, + 4, + 3, + 0, + 5, + 8, + }, + ec_gf8_mul_42_ops}; + +static ec_gf_op_t ec_gf8_mul_43_ops[] = { + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_43 = {8, + { + 2, + 6, + 4, + 1, + 7, + 3, + 0, + 5, + }, + ec_gf8_mul_43_ops}; + +static ec_gf_op_t ec_gf8_mul_44_ops[] = { + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, 
{EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_44 = {8, + { + 2, + 3, + 4, + 1, + 6, + 5, + 0, + 7, + }, + ec_gf8_mul_44_ops}; + +static ec_gf_op_t ec_gf8_mul_45_ops[] = { + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_45 = {8, + { + 2, + 3, + 0, + 1, + 7, + 4, + 5, + 6, + }, + ec_gf8_mul_45_ops}; + +static ec_gf_op_t ec_gf8_mul_46_ops[] = { + {EC_GF_OP_XOR3, 8, 2, 4}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 8, 0, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 8, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_46 = {9, + { + 2, + 0, + 1, + 3, + 4, + 5, + 6, + 7, + 8, + }, + ec_gf8_mul_46_ops}; + +static ec_gf_op_t ec_gf8_mul_47_ops[] = { + {EC_GF_OP_XOR3, 8, 0, 1}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_47 = {9, + { + 2, + 3, + 4, + 5, + 6, + 7, + 0, + 1, + 8, + }, + ec_gf8_mul_47_ops}; + +static ec_gf_op_t ec_gf8_mul_48_ops[] = { + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_48 = {8, + { + 4, + 5, + 6, + 0, + 1, + 3, + 7, + 2, + }, + ec_gf8_mul_48_ops}; + +static ec_gf_op_t ec_gf8_mul_49_ops[] = { + {EC_GF_OP_XOR2, 3, 0, 0}, 
{EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR3, 8, 0, 6}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 7, 8, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR3, 1, 8, 5}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_49 = {9, + { + 7, + 2, + 4, + 0, + 3, + 5, + 1, + 6, + 8, + }, + ec_gf8_mul_49_ops}; + +static ec_gf_op_t ec_gf8_mul_4A_ops[] = { + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_4A = {8, + { + 5, + 6, + 7, + 0, + 1, + 3, + 4, + 2, + }, + ec_gf8_mul_4A_ops}; + +static ec_gf_op_t ec_gf8_mul_4B_ops[] = { + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR3, 8, 3, 7}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 8, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 5, 8, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_4B = {9, + { + 5, + 3, + 6, + 7, + 0, + 2, + 4, + 1, + 8, + }, + ec_gf8_mul_4B_ops}; + +static ec_gf_op_t ec_gf8_mul_4C_ops[] = { + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + 
+static ec_gf_mul_t ec_gf8_mul_4C = {8, + { + 5, + 3, + 4, + 7, + 0, + 6, + 2, + 1, + }, + ec_gf8_mul_4C_ops}; + +static ec_gf_op_t ec_gf8_mul_4D_ops[] = { + {EC_GF_OP_COPY, 8, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR3, 9, 3, 1}, + {EC_GF_OP_XOR2, 5, 9, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR3, 0, 8, 2}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_4D = {10, + { + 0, + 9, + 3, + 5, + 6, + 4, + 7, + 1, + 2, + 8, + }, + ec_gf8_mul_4D_ops}; + +static ec_gf_op_t ec_gf8_mul_4E_ops[] = { + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_4E = {8, + { + 2, + 3, + 0, + 1, + 5, + 6, + 7, + 4, + }, + ec_gf8_mul_4E_ops}; + +static ec_gf_op_t ec_gf8_mul_4F_ops[] = { + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_4F = {8, + { + 0, + 3, + 5, + 6, + 1, + 2, + 7, + 4, + }, + ec_gf8_mul_4F_ops}; + +static ec_gf_op_t ec_gf8_mul_50_ops[] = { + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, 
{EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_50 = {8, + { + 4, + 5, + 7, + 3, + 0, + 1, + 2, + 6, + }, + ec_gf8_mul_50_ops}; + +static ec_gf_op_t ec_gf8_mul_51_ops[] = { + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_51 = {8, + { + 0, + 1, + 7, + 2, + 3, + 4, + 5, + 6, + }, + ec_gf8_mul_51_ops}; + +static ec_gf_op_t ec_gf8_mul_52_ops[] = { + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_COPY, 9, 4, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR3, 3, 5, 8}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 2, 9, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_52 = {10, + { + 2, + 3, + 1, + 4, + 6, + 7, + 0, + 5, + 8, + 9, + }, + ec_gf8_mul_52_ops}; + +static ec_gf_op_t ec_gf8_mul_53_ops[] = { + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_53 = {8, + { + 2, + 0, + 1, + 4, + 5, + 6, + 7, + 3, + }, + ec_gf8_mul_53_ops}; + +static ec_gf_op_t ec_gf8_mul_54_ops[] = { + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, 
{EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_54 = {8, + { + 7, + 3, + 0, + 4, + 2, + 6, + 5, + 1, + }, + ec_gf8_mul_54_ops}; + +static ec_gf_op_t ec_gf8_mul_55_ops[] = { + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_55 = {8, + { + 1, + 5, + 6, + 4, + 3, + 7, + 2, + 0, + }, + ec_gf8_mul_55_ops}; + +static ec_gf_op_t ec_gf8_mul_56_ops[] = { + {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_56 = {8, + { + 2, + 3, + 0, + 4, + 5, + 6, + 7, + 1, + }, + ec_gf8_mul_56_ops}; + +static ec_gf_op_t ec_gf8_mul_57_ops[] = { + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_57 = {8, + { + 2, + 3, + 0, + 1, + 4, + 5, + 6, + 7, + }, + ec_gf8_mul_57_ops}; + +static ec_gf_op_t ec_gf8_mul_58_ops[] = { + 
{EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_58 = {8, + { + 4, + 3, + 2, + 7, + 0, + 1, + 5, + 6, + }, + ec_gf8_mul_58_ops}; + +static ec_gf_op_t ec_gf8_mul_59_ops[] = { + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_59 = {8, + { + 7, + 3, + 5, + 6, + 1, + 2, + 0, + 4, + }, + ec_gf8_mul_59_ops}; + +static ec_gf_op_t ec_gf8_mul_5A_ops[] = { + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_5A = {8, + { + 6, + 7, + 0, + 1, + 2, + 3, + 5, + 4, + }, + ec_gf8_mul_5A_ops}; + +static ec_gf_op_t ec_gf8_mul_5B_ops[] = { + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, 
{EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_5B = {8, + { + 6, + 0, + 7, + 5, + 2, + 1, + 3, + 4, + }, + ec_gf8_mul_5B_ops}; + +static ec_gf_op_t ec_gf8_mul_5C_ops[] = { + {EC_GF_OP_COPY, 8, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_XOR2, 2, 8, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_5C = {9, + { + 7, + 5, + 2, + 4, + 1, + 0, + 6, + 3, + 8, + }, + ec_gf8_mul_5C_ops}; + +static ec_gf_op_t ec_gf8_mul_5D_ops[] = { + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_5D = {8, + { + 1, + 3, + 5, + 4, + 6, + 7, + 2, + 0, + }, + ec_gf8_mul_5D_ops}; + +static ec_gf_op_t ec_gf8_mul_5E_ops[] = { + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_5E = {8, + { + 4, + 3, + 6, + 2, + 5, + 7, + 0, + 1, + }, + ec_gf8_mul_5E_ops}; + +static ec_gf_op_t ec_gf8_mul_5F_ops[] = { + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + 
{EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 0, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_5F = {8, + { + 6, + 1, + 3, + 4, + 5, + 7, + 2, + 0, + }, + ec_gf8_mul_5F_ops}; + +static ec_gf_op_t ec_gf8_mul_60_ops[] = { + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_60 = {8, + { + 2, + 3, + 4, + 7, + 5, + 6, + 0, + 1, + }, + ec_gf8_mul_60_ops}; + +static ec_gf_op_t ec_gf8_mul_61_ops[] = { + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_61 = {8, + { + 0, + 5, + 6, + 7, + 4, + 2, + 1, + 3, + }, + ec_gf8_mul_61_ops}; + +static ec_gf_op_t ec_gf8_mul_62_ops[] = { + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_62 = 
{8, + { + 2, + 0, + 3, + 4, + 5, + 6, + 7, + 1, + }, + ec_gf8_mul_62_ops}; + +static ec_gf_op_t ec_gf8_mul_63_ops[] = { + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_63 = {8, + { + 3, + 4, + 6, + 5, + 7, + 0, + 1, + 2, + }, + ec_gf8_mul_63_ops}; + +static ec_gf_op_t ec_gf8_mul_64_ops[] = { + {EC_GF_OP_COPY, 8, 1, 0}, {EC_GF_OP_XOR2, 8, 0, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 8, 7, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_64 = {9, + { + 2, + 3, + 4, + 6, + 5, + 7, + 8, + 1, + 0, + }, + ec_gf8_mul_64_ops}; + +static ec_gf_op_t ec_gf8_mul_65_ops[] = { + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_65 = {8, + { + 2, + 5, + 1, + 3, + 4, + 0, + 6, + 7, + }, + ec_gf8_mul_65_ops}; + +static ec_gf_op_t ec_gf8_mul_66_ops[] = { + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 5, 
7, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_66 = {8, + { + 2, + 3, + 1, + 4, + 5, + 7, + 0, + 6, + }, + ec_gf8_mul_66_ops}; + +static ec_gf_op_t ec_gf8_mul_67_ops[] = { + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_67 = {8, + { + 2, + 4, + 5, + 6, + 7, + 3, + 1, + 0, + }, + ec_gf8_mul_67_ops}; + +static ec_gf_op_t ec_gf8_mul_68_ops[] = { + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_68 = {8, + { + 5, + 7, + 2, + 3, + 0, + 6, + 4, + 1, + }, + ec_gf8_mul_68_ops}; + +static ec_gf_op_t ec_gf8_mul_69_ops[] = { + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_69 = {8, + { + 0, + 1, + 
3, + 2, + 4, + 5, + 7, + 6, + }, + ec_gf8_mul_69_ops}; + +static ec_gf_op_t ec_gf8_mul_6A_ops[] = { + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_6A = {8, + { + 5, + 7, + 4, + 6, + 1, + 2, + 0, + 3, + }, + ec_gf8_mul_6A_ops}; + +static ec_gf_op_t ec_gf8_mul_6B_ops[] = { + {EC_GF_OP_COPY, 8, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_6B = {9, + { + 6, + 7, + 2, + 0, + 3, + 1, + 5, + 4, + 8, + }, + ec_gf8_mul_6B_ops}; + +static ec_gf_op_t ec_gf8_mul_6C_ops[] = { + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_6C = {8, + { + 5, + 6, + 7, + 0, + 1, + 2, + 3, + 4, + }, + ec_gf8_mul_6C_ops}; + +static ec_gf_op_t ec_gf8_mul_6D_ops[] = { + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR3, 8, 3, 4}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 5, 0, 0}, 
{EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_6D = {9, + { + 3, + 6, + 7, + 0, + 4, + 5, + 1, + 2, + 8, + }, + ec_gf8_mul_6D_ops}; + +static ec_gf_op_t ec_gf8_mul_6E_ops[] = { + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_6E = {8, + { + 5, + 6, + 3, + 1, + 7, + 2, + 0, + 4, + }, + ec_gf8_mul_6E_ops}; + +static ec_gf_op_t ec_gf8_mul_6F_ops[] = { + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR3, 0, 8, 7}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_6F = {9, + { + 2, + 6, + 3, + 7, + 0, + 1, + 4, + 5, + 8, + }, + ec_gf8_mul_6F_ops}; + +static ec_gf_op_t ec_gf8_mul_70_ops[] = { + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_70 = {8, + { + 3, + 4, + 5, + 2, + 6, + 0, + 1, + 7, + }, + ec_gf8_mul_70_ops}; + +static ec_gf_op_t ec_gf8_mul_71_ops[] = { + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + 
{EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_71 = {8, + { + 4, + 7, + 5, + 3, + 6, + 0, + 2, + 1, + }, + ec_gf8_mul_71_ops}; + +static ec_gf_op_t ec_gf8_mul_72_ops[] = { + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_72 = {8, + { + 0, + 5, + 2, + 7, + 4, + 1, + 3, + 6, + }, + ec_gf8_mul_72_ops}; + +static ec_gf_op_t ec_gf8_mul_73_ops[] = { + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_73 = {8, + { + 6, + 0, + 1, + 7, + 4, + 5, + 2, + 3, + }, + ec_gf8_mul_73_ops}; + +static ec_gf_op_t ec_gf8_mul_74_ops[] = { + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_74 = {8, + { + 3, + 2, + 1, + 0, + 4, + 5, + 6, + 7, + }, + 
ec_gf8_mul_74_ops}; + +static ec_gf_op_t ec_gf8_mul_75_ops[] = { + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_75 = {8, + { + 4, + 5, + 6, + 7, + 0, + 1, + 2, + 3, + }, + ec_gf8_mul_75_ops}; + +static ec_gf_op_t ec_gf8_mul_76_ops[] = { + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR3, 8, 6, 2}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 0, 8, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_76 = {9, + { + 2, + 3, + 0, + 6, + 5, + 1, + 7, + 8, + 4, + }, + ec_gf8_mul_76_ops}; + +static ec_gf_op_t ec_gf8_mul_77_ops[] = { + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_77 = {8, + { + 7, + 4, + 3, + 6, + 0, + 1, + 5, + 2, + }, + ec_gf8_mul_77_ops}; + +static ec_gf_op_t ec_gf8_mul_78_ops[] = { + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR3, 8, 0, 2}, + {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 1, 8, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 5, 
1, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_78 = {9, + { + 4, + 7, + 3, + 2, + 5, + 1, + 6, + 0, + 8, + }, + ec_gf8_mul_78_ops}; + +static ec_gf_op_t ec_gf8_mul_79_ops[] = { + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR3, 8, 4, 7}, + {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_79 = {9, + { + 4, + 5, + 7, + 3, + 1, + 6, + 2, + 0, + 8, + }, + ec_gf8_mul_79_ops}; + +static ec_gf_op_t ec_gf8_mul_7A_ops[] = { + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 1, 0, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_7A = {8, + { + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 0, + }, + ec_gf8_mul_7A_ops}; + +static ec_gf_op_t ec_gf8_mul_7B_ops[] = { + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR3, 8, 5, 3}, + {EC_GF_OP_XOR2, 8, 0, 0}, {EC_GF_OP_COPY, 9, 4, 0}, + {EC_GF_OP_XOR2, 8, 2, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR3, 4, 1, 9}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_7B = {10, + { + 1, + 2, + 3, + 4, + 8, + 5, + 6, + 0, + 7, + 9, + }, + ec_gf8_mul_7B_ops}; + +static ec_gf_op_t ec_gf8_mul_7C_ops[] = { + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 
4, 6, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_7C = {8, + { + 2, + 4, + 1, + 6, + 3, + 5, + 7, + 0, + }, + ec_gf8_mul_7C_ops}; + +static ec_gf_op_t ec_gf8_mul_7D_ops[] = { + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 0, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_7D = {8, + { + 1, + 0, + 3, + 5, + 6, + 7, + 2, + 4, + }, + ec_gf8_mul_7D_ops}; + +static ec_gf_op_t ec_gf8_mul_7E_ops[] = { + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_COPY, 8, 0, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR3, 6, 2, 7}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_7E = {9, + { + 5, + 1, + 2, + 0, + 7, + 3, + 4, + 6, + 8, + }, + ec_gf8_mul_7E_ops}; + +static ec_gf_op_t ec_gf8_mul_7F_ops[] = { + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR3, 9, 7, 5}, {EC_GF_OP_XOR2, 2, 9, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 6, 9, 0}, + {EC_GF_OP_XOR3, 9, 6, 4}, {EC_GF_OP_XOR2, 7, 9, 0}, + {EC_GF_OP_XOR2, 3, 9, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 7, 8, 
0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_7F = {10, + { + 4, + 1, + 0, + 5, + 6, + 7, + 2, + 3, + 8, + 9, + }, + ec_gf8_mul_7F_ops}; + +static ec_gf_op_t ec_gf8_mul_80_ops[] = { + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_80 = {8, + { + 7, + 5, + 6, + 4, + 1, + 2, + 3, + 0, + }, + ec_gf8_mul_80_ops}; + +static ec_gf_op_t ec_gf8_mul_81_ops[] = { + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_81 = {8, + { + 2, + 7, + 4, + 1, + 5, + 6, + 3, + 0, + }, + ec_gf8_mul_81_ops}; + +static ec_gf_op_t ec_gf8_mul_82_ops[] = { + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_COPY, 8, 6, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR3, 5, 8, 7}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_82 = {9, + { + 6, + 2, + 7, + 5, + 1, + 3, + 4, + 0, + 8, + }, + ec_gf8_mul_82_ops}; + +static ec_gf_op_t ec_gf8_mul_83_ops[] = { + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 2, 5, 
0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_83 = {8, + { + 3, + 5, + 6, + 7, + 1, + 2, + 4, + 0, + }, + ec_gf8_mul_83_ops}; + +static ec_gf_op_t ec_gf8_mul_84_ops[] = { + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_84 = {8, + { + 7, + 6, + 0, + 4, + 1, + 5, + 3, + 2, + }, + ec_gf8_mul_84_ops}; + +static ec_gf_op_t ec_gf8_mul_85_ops[] = { + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_85 = {8, + { + 7, + 6, + 0, + 3, + 2, + 4, + 5, + 1, + }, + ec_gf8_mul_85_ops}; + +static ec_gf_op_t ec_gf8_mul_86_ops[] = { + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_86 = {8, + { + 1, + 2, + 6, + 4, + 5, + 7, + 3, + 0, + }, 
+ ec_gf8_mul_86_ops}; + +static ec_gf_op_t ec_gf8_mul_87_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_COPY, 8, 1, 0}, + {EC_GF_OP_XOR2, 8, 6, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR3, 5, 8, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 8, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_87 = {9, + { + 1, + 2, + 3, + 4, + 5, + 7, + 6, + 0, + 8, + }, + ec_gf8_mul_87_ops}; + +static ec_gf_op_t ec_gf8_mul_88_ops[] = { + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 1, 0, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_88 = {8, + { + 6, + 7, + 3, + 1, + 2, + 4, + 5, + 0, + }, + ec_gf8_mul_88_ops}; + +static ec_gf_op_t ec_gf8_mul_89_ops[] = { + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR3, 8, 5, 2}, + {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_89 = {9, + { + 2, + 1, + 6, + 5, + 7, + 3, + 4, + 0, + 8, + }, + ec_gf8_mul_89_ops}; + +static ec_gf_op_t ec_gf8_mul_8A_ops[] = { + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, 
{EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_8A = {8, + { + 1, + 2, + 3, + 0, + 6, + 7, + 4, + 5, + }, + ec_gf8_mul_8A_ops}; + +static ec_gf_op_t ec_gf8_mul_8B_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_8B = {8, + { + 6, + 1, + 2, + 3, + 5, + 7, + 4, + 0, + }, + ec_gf8_mul_8B_ops}; + +static ec_gf_op_t ec_gf8_mul_8C_ops[] = { + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_8C = {8, + { + 1, + 2, + 0, + 7, + 3, + 4, + 5, + 6, + }, + ec_gf8_mul_8C_ops}; + +static ec_gf_op_t ec_gf8_mul_8D_ops[] = { + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_8D = {8, + { + 7, + 1, + 3, + 2, + 4, + 5, + 0, + 6, + }, + ec_gf8_mul_8D_ops}; + +static ec_gf_op_t ec_gf8_mul_8E_ops[] = {{EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_8E = {8, 
+ { + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 0, + }, + ec_gf8_mul_8E_ops}; + +static ec_gf_op_t ec_gf8_mul_8F_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_8F = {8, + { + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 0, + }, + ec_gf8_mul_8F_ops}; + +static ec_gf_op_t ec_gf8_mul_90_ops[] = { + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_90 = {8, + { + 4, + 5, + 6, + 7, + 0, + 1, + 3, + 2, + }, + ec_gf8_mul_90_ops}; + +static ec_gf_op_t ec_gf8_mul_91_ops[] = { + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_COPY, 9, 1, 0}, {EC_GF_OP_COPY, 8, 3, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 7, 9, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR3, 5, 8, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_91 = {10, + { + 2, + 3, + 1, + 4, + 0, + 6, + 7, + 5, + 8, + 9, + }, + ec_gf8_mul_91_ops}; + +static ec_gf_op_t ec_gf8_mul_92_ops[] = { + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 5, 
3, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_92 = {8, + { + 6, + 7, + 0, + 1, + 2, + 3, + 5, + 4, + }, + ec_gf8_mul_92_ops}; + +static ec_gf_op_t ec_gf8_mul_93_ops[] = { + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_93 = {8, + { + 6, + 4, + 5, + 1, + 7, + 2, + 3, + 0, + }, + ec_gf8_mul_93_ops}; + +static ec_gf_op_t ec_gf8_mul_94_ops[] = { + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_94 = {8, + { + 7, + 5, + 0, + 2, + 6, + 1, + 3, + 4, + }, + ec_gf8_mul_94_ops}; + +static ec_gf_op_t ec_gf8_mul_95_ops[] = { + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_95 = {8, + { + 7, + 6, + 1, + 3, + 0, + 4, + 5, + 2, + }, + ec_gf8_mul_95_ops}; + +static ec_gf_op_t ec_gf8_mul_96_ops[] = { + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR3, 8, 0, 4}, 
{EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 8, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_96 = {9, + { + 4, + 0, + 1, + 6, + 7, + 2, + 3, + 5, + 8, + }, + ec_gf8_mul_96_ops}; + +static ec_gf_op_t ec_gf8_mul_97_ops[] = { + {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_COPY, 8, 2, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 8, 6, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 5, 8, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_97 = {9, + { + 4, + 5, + 3, + 6, + 7, + 1, + 2, + 0, + 8, + }, + ec_gf8_mul_97_ops}; + +static ec_gf_op_t ec_gf8_mul_98_ops[] = { + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_98 = {8, + { + 4, + 2, + 3, + 6, + 7, + 5, + 1, + 0, + }, + ec_gf8_mul_98_ops}; + +static ec_gf_op_t ec_gf8_mul_99_ops[] = { + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + 
+static ec_gf_mul_t ec_gf8_mul_99 = {8, + { + 6, + 5, + 3, + 7, + 0, + 1, + 4, + 2, + }, + ec_gf8_mul_99_ops}; + +static ec_gf_op_t ec_gf8_mul_9A_ops[] = { + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR3, 8, 4, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR2, 1, 8, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_9A = {9, + { + 6, + 3, + 4, + 0, + 5, + 1, + 2, + 7, + 8, + }, + ec_gf8_mul_9A_ops}; + +static ec_gf_op_t ec_gf8_mul_9B_ops[] = { + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_COPY, 9, 5, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR3, 8, 3, 2}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 3, 9, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_9B = {10, + { + 4, + 5, + 8, + 6, + 7, + 1, + 2, + 0, + 3, + 9, + }, + ec_gf8_mul_9B_ops}; + +static ec_gf_op_t ec_gf8_mul_9C_ops[] = { + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_9C = {8, + { + 3, + 2, + 1, + 0, + 4, + 5, + 6, + 7, + }, + ec_gf8_mul_9C_ops}; + +static ec_gf_op_t ec_gf8_mul_9D_ops[] = { + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, 
{EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_9D = {8, + { + 0, + 1, + 2, + 3, + 7, + 4, + 5, + 6, + }, + ec_gf8_mul_9D_ops}; + +static ec_gf_op_t ec_gf8_mul_9E_ops[] = { + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_COPY, 8, 7, 0}, + {EC_GF_OP_XOR2, 8, 5, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_9E = {9, + { + 4, + 5, + 3, + 8, + 6, + 0, + 2, + 7, + 1, + }, + ec_gf8_mul_9E_ops}; + +static ec_gf_op_t ec_gf8_mul_9F_ops[] = { + {EC_GF_OP_XOR3, 8, 1, 2}, {EC_GF_OP_XOR2, 8, 3, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_XOR2, 5, 8, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_9F = {9, + { + 4, + 5, + 6, + 7, + 0, + 1, + 2, + 3, + 8, + }, + ec_gf8_mul_9F_ops}; + +static ec_gf_op_t ec_gf8_mul_A0_ops[] = { + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_A0 = {8, + { + 3, + 1, + 6, + 7, + 5, + 2, + 4, + 0, + }, + ec_gf8_mul_A0_ops}; + +static ec_gf_op_t ec_gf8_mul_A1_ops[] = { + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, 
{EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR3, 8, 0, 6}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 7, 8, 0}, + {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_A1 = {9, + { + 7, + 4, + 1, + 5, + 6, + 0, + 2, + 3, + 8, + }, + ec_gf8_mul_A1_ops}; + +static ec_gf_op_t ec_gf8_mul_A2_ops[] = { + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_A2 = {8, + { + 7, + 0, + 6, + 3, + 2, + 1, + 4, + 5, + }, + ec_gf8_mul_A2_ops}; + +static ec_gf_op_t ec_gf8_mul_A3_ops[] = { + {EC_GF_OP_COPY, 8, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 3, 8, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_A3 = {9, + { + 3, + 7, + 2, + 6, + 1, + 4, + 0, + 5, + 8, + }, + ec_gf8_mul_A3_ops}; + +static ec_gf_op_t ec_gf8_mul_A4_ops[] = { + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_A4 = {8, + { + 5, + 6, + 7, + 2, + 4, + 3, + 0, + 
1, + }, + ec_gf8_mul_A4_ops}; + +static ec_gf_op_t ec_gf8_mul_A5_ops[] = { + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR3, 8, 5, 6}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 7, 8, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_A5 = {9, + { + 1, + 4, + 2, + 5, + 6, + 7, + 3, + 0, + 8, + }, + ec_gf8_mul_A5_ops}; + +static ec_gf_op_t ec_gf8_mul_A6_ops[] = { + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_A6 = {8, + { + 1, + 2, + 0, + 3, + 4, + 5, + 6, + 7, + }, + ec_gf8_mul_A6_ops}; + +static ec_gf_op_t ec_gf8_mul_A7_ops[] = { + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_A7 = {8, + { + 0, + 1, + 2, + 5, + 6, + 7, + 3, + 4, + }, + ec_gf8_mul_A7_ops}; + +static ec_gf_op_t ec_gf8_mul_A8_ops[] = { + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 8, 1, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_COPY, 9, 4, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 8, 3, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 2, 9, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + 
{EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_A8 = {10, + { + 1, + 7, + 5, + 8, + 6, + 3, + 4, + 0, + 2, + 9, + }, + ec_gf8_mul_A8_ops}; + +static ec_gf_op_t ec_gf8_mul_A9_ops[] = { + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0}, + {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_A9 = {8, + { + 3, + 7, + 6, + 1, + 2, + 0, + 4, + 5, + }, + ec_gf8_mul_A9_ops}; + +static ec_gf_op_t ec_gf8_mul_AA_ops[] = { + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_AA = {8, + { + 0, + 4, + 5, + 3, + 6, + 7, + 1, + 2, + }, + ec_gf8_mul_AA_ops}; + +static ec_gf_op_t ec_gf8_mul_AB_ops[] = { + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_COPY, 9, 6, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_XOR2, 8, 7, 0}, {EC_GF_OP_XOR2, 3, 8, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR3, 3, 9, 7}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_AB = {10, + { + 2, + 3, + 8, + 0, + 5, + 6, + 1, + 4, + 7, + 9, + }, + ec_gf8_mul_AB_ops}; + +static ec_gf_op_t ec_gf8_mul_AC_ops[] = { + {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 
0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_AC = {8, + { + 3, + 2, + 1, + 0, + 4, + 5, + 6, + 7, + }, + ec_gf8_mul_AC_ops}; + +static ec_gf_op_t ec_gf8_mul_AD_ops[] = { + {EC_GF_OP_XOR3, 8, 1, 2}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 8, 0}, + {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_AD = {9, + { + 3, + 4, + 5, + 6, + 7, + 0, + 1, + 2, + 8, + }, + ec_gf8_mul_AD_ops}; + +static ec_gf_op_t ec_gf8_mul_AE_ops[] = { + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_COPY, 8, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 8, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_AE = {9, + { + 7, + 0, + 5, + 6, + 3, + 4, + 1, + 2, + 8, + }, + ec_gf8_mul_AE_ops}; + +static ec_gf_op_t ec_gf8_mul_AF_ops[] = { + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_AF = {8, + { + 0, + 1, + 2, + 7, + 3, + 4, + 5, + 6, + }, + ec_gf8_mul_AF_ops}; + +static ec_gf_op_t ec_gf8_mul_B0_ops[] = { + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, 
+ {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 1, 0, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_B0 = {8, + { + 4, + 0, + 7, + 2, + 3, + 1, + 6, + 5, + }, + ec_gf8_mul_B0_ops}; + +static ec_gf_op_t ec_gf8_mul_B1_ops[] = { + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_COPY, 8, 4, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR3, 5, 8, 1}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_B1 = {9, + { + 2, + 6, + 4, + 7, + 0, + 1, + 3, + 5, + 8, + }, + ec_gf8_mul_B1_ops}; + +static ec_gf_op_t ec_gf8_mul_B2_ops[] = { + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR3, 8, 4, 5}, + {EC_GF_OP_XOR2, 2, 8, 0}, {EC_GF_OP_XOR2, 8, 1, 0}, + {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR2, 3, 8, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_B2 = {9, + { + 0, + 7, + 4, + 5, + 6, + 1, + 2, + 3, + 8, + }, + ec_gf8_mul_B2_ops}; + +static ec_gf_op_t ec_gf8_mul_B3_ops[] = { + {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_COPY, 9, 5, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR3, 8, 6, 4}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 8, 5, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 7, 8, 0}, + {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_XOR3, 1, 9, 0}, 
{EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_B3 = {10, + { + 2, + 3, + 4, + 5, + 1, + 6, + 0, + 7, + 8, + 9, + }, + ec_gf8_mul_B3_ops}; + +static ec_gf_op_t ec_gf8_mul_B4_ops[] = { + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_B4 = {8, + { + 5, + 6, + 7, + 0, + 1, + 2, + 3, + 4, + }, + ec_gf8_mul_B4_ops}; + +static ec_gf_op_t ec_gf8_mul_B5_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_COPY, 8, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR3, 4, 8, 3}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_B5 = {9, + { + 3, + 4, + 0, + 7, + 1, + 5, + 6, + 2, + 8, + }, + ec_gf8_mul_B5_ops}; + +static ec_gf_op_t ec_gf8_mul_B6_ops[] = { + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_B6 = {8, + { + 5, + 3, + 6, + 4, + 7, + 0, + 1, + 2, + }, + ec_gf8_mul_B6_ops}; + +static ec_gf_op_t ec_gf8_mul_B7_ops[] = { + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, 
+ {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_B7 = {8, + { + 5, + 0, + 1, + 4, + 2, + 6, + 7, + 3, + }, + ec_gf8_mul_B7_ops}; + +static ec_gf_op_t ec_gf8_mul_B8_ops[] = { + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_B8 = {8, + { + 6, + 4, + 5, + 1, + 2, + 0, + 7, + 3, + }, + ec_gf8_mul_B8_ops}; + +static ec_gf_op_t ec_gf8_mul_B9_ops[] = { + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR3, 0, 8, 2}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_B9 = {9, + { + 6, + 7, + 0, + 2, + 1, + 4, + 5, + 3, + 8, + }, + ec_gf8_mul_B9_ops}; + +static ec_gf_op_t ec_gf8_mul_BA_ops[] = { + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_BA = {8, + { + 1, + 2, + 4, + 3, + 5, + 6, + 0, + 7, + }, + ec_gf8_mul_BA_ops}; + 
+static ec_gf_op_t ec_gf8_mul_BB_ops[] = { + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_COPY, 8, 3, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 8, 5, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 8, 7, 0}, {EC_GF_OP_XOR2, 2, 8, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_BB = {9, + { + 7, + 2, + 1, + 8, + 3, + 5, + 6, + 4, + 0, + }, + ec_gf8_mul_BB_ops}; + +static ec_gf_op_t ec_gf8_mul_BC_ops[] = { + {EC_GF_OP_COPY, 8, 1, 0}, {EC_GF_OP_XOR2, 8, 2, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR3, 2, 8, 4}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_BC = {9, + { + 2, + 6, + 3, + 4, + 5, + 1, + 7, + 0, + 8, + }, + ec_gf8_mul_BC_ops}; + +static ec_gf_op_t ec_gf8_mul_BD_ops[] = { + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_BD = {8, + { + 4, + 5, + 0, + 2, + 7, + 1, + 6, + 3, + }, + ec_gf8_mul_BD_ops}; + +static ec_gf_op_t ec_gf8_mul_BE_ops[] = { + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, 
{EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_BE = {8, + { + 0, + 6, + 7, + 4, + 5, + 1, + 3, + 2, + }, + ec_gf8_mul_BE_ops}; + +static ec_gf_op_t ec_gf8_mul_BF_ops[] = { + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_BF = {8, + { + 5, + 6, + 1, + 7, + 3, + 0, + 2, + 4, + }, + ec_gf8_mul_BF_ops}; + +static ec_gf_op_t ec_gf8_mul_C0_ops[] = { + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_C0 = {8, + { + 1, + 2, + 3, + 4, + 7, + 5, + 6, + 0, + }, + ec_gf8_mul_C0_ops}; + +static ec_gf_op_t ec_gf8_mul_C1_ops[] = { + {EC_GF_OP_XOR3, 8, 1, 2}, {EC_GF_OP_XOR2, 8, 3, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_XOR2, 5, 8, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_C1 = {9, + { + 5, + 6, + 7, + 4, + 1, + 2, + 3, + 0, + 8, + }, + ec_gf8_mul_C1_ops}; + +static ec_gf_op_t ec_gf8_mul_C2_ops[] = { + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, 
{EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_C2 = {8, + { + 7, + 6, + 3, + 0, + 1, + 4, + 5, + 2, + }, + ec_gf8_mul_C2_ops}; + +static ec_gf_op_t ec_gf8_mul_C3_ops[] = { + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_XOR3, 0, 2, 6}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR3, 9, 1, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 7, 9, 0}, + {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_C3 = {10, + { + 5, + 6, + 4, + 7, + 1, + 2, + 3, + 0, + 8, + 9, + }, + ec_gf8_mul_C3_ops}; + +static ec_gf_op_t ec_gf8_mul_C4_ops[] = { + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 1, 0, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_C4 = {8, + { + 0, + 2, + 1, + 3, + 4, + 5, + 6, + 7, + }, + ec_gf8_mul_C4_ops}; + +static ec_gf_op_t ec_gf8_mul_C5_ops[] = { + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + 
{EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_C5 = {8, + { + 4, + 3, + 5, + 7, + 6, + 2, + 0, + 1, + }, + ec_gf8_mul_C5_ops}; + +static ec_gf_op_t ec_gf8_mul_C6_ops[] = { + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_COPY, 8, 4, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR3, 9, 5, 4}, + {EC_GF_OP_XOR2, 6, 9, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 7, 9, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 6, 8, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_C6 = {10, + { + 6, + 3, + 0, + 4, + 5, + 7, + 2, + 1, + 8, + 9, + }, + ec_gf8_mul_C6_ops}; + +static ec_gf_op_t ec_gf8_mul_C7_ops[] = { + {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_C7 = {8, + { + 7, + 0, + 6, + 2, + 5, + 3, + 4, + 1, + }, + ec_gf8_mul_C7_ops}; + +static ec_gf_op_t ec_gf8_mul_C8_ops[] = { + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_C8 = {8, + { + 1, + 3, + 2, + 4, + 6, + 7, + 5, + 0, + }, + ec_gf8_mul_C8_ops}; + +static ec_gf_op_t ec_gf8_mul_C9_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, 
{EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_C9 = {8, + { + 2, + 3, + 4, + 5, + 6, + 7, + 0, + 1, + }, + ec_gf8_mul_C9_ops}; + +static ec_gf_op_t ec_gf8_mul_CA_ops[] = { + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_CA = {8, + { + 1, + 2, + 5, + 7, + 3, + 4, + 0, + 6, + }, + ec_gf8_mul_CA_ops}; + +static ec_gf_op_t ec_gf8_mul_CB_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_CB = {8, + { + 2, + 3, + 4, + 5, + 7, + 6, + 0, + 1, + }, + ec_gf8_mul_CB_ops}; + +static ec_gf_op_t ec_gf8_mul_CC_ops[] = { + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_CC = {8, + { + 2, + 7, + 1, + 0, + 5, + 6, + 3, + 4, + }, + ec_gf8_mul_CC_ops}; + +static ec_gf_op_t 
ec_gf8_mul_CD_ops[] = { + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_CD = {8, + { + 0, + 6, + 1, + 2, + 7, + 3, + 4, + 5, + }, + ec_gf8_mul_CD_ops}; + +static ec_gf_op_t ec_gf8_mul_CE_ops[] = { + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_COPY, 8, 7, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR3, 3, 6, 8}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR3, 8, 2, 3}, + {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 4, 8, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_CE = {9, + { + 5, + 7, + 3, + 0, + 2, + 6, + 4, + 1, + 8, + }, + ec_gf8_mul_CE_ops}; + +static ec_gf_op_t ec_gf8_mul_CF_ops[] = { + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_CF = {8, + { + 3, + 6, + 7, + 0, + 2, + 4, + 5, + 1, + }, + ec_gf8_mul_CF_ops}; + +static ec_gf_op_t ec_gf8_mul_D0_ops[] = { + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, 
{EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_D0 = {8, + { + 5, + 6, + 7, + 2, + 0, + 3, + 1, + 4, + }, + ec_gf8_mul_D0_ops}; + +static ec_gf_op_t ec_gf8_mul_D1_ops[] = { + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR3, 8, 6, 0}, + {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 1, 8, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_D1 = {9, + { + 5, + 6, + 3, + 2, + 0, + 7, + 4, + 1, + 8, + }, + ec_gf8_mul_D1_ops}; + +static ec_gf_op_t ec_gf8_mul_D2_ops[] = { + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_D2 = {8, + { + 7, + 0, + 2, + 1, + 3, + 4, + 6, + 5, + }, + ec_gf8_mul_D2_ops}; + +static ec_gf_op_t ec_gf8_mul_D3_ops[] = { + {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_COPY, 8, 4, 0}, + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 8, 6, 0}, {EC_GF_OP_XOR2, 3, 8, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_D3 = {9, + { + 0, + 3, + 2, + 8, + 4, + 6, + 7, + 1, + 5, + }, + ec_gf8_mul_D3_ops}; + +static ec_gf_op_t ec_gf8_mul_D4_ops[] = { + 
{EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_COPY, 8, 1, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR3, 1, 7, 8}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_D4 = {9, + { + 4, + 1, + 7, + 5, + 0, + 6, + 3, + 2, + 8, + }, + ec_gf8_mul_D4_ops}; + +static ec_gf_op_t ec_gf8_mul_D5_ops[] = { + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_D5 = {8, + { + 6, + 7, + 4, + 5, + 2, + 3, + 1, + 0, + }, + ec_gf8_mul_D5_ops}; + +static ec_gf_op_t ec_gf8_mul_D6_ops[] = { + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_D6 = {9, + { + 0, + 6, + 2, + 7, + 1, + 3, + 4, + 5, + 8, + }, + ec_gf8_mul_D6_ops}; + +static ec_gf_op_t ec_gf8_mul_D7_ops[] = { + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR3, 8, 3, 5}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + 
{EC_GF_OP_XOR3, 6, 7, 8}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_D7 = {9, + { + 3, + 4, + 6, + 5, + 0, + 7, + 1, + 2, + 8, + }, + ec_gf8_mul_D7_ops}; + +static ec_gf_op_t ec_gf8_mul_D8_ops[] = { + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_D8 = {8, + { + 4, + 5, + 6, + 7, + 0, + 1, + 2, + 3, + }, + ec_gf8_mul_D8_ops}; + +static ec_gf_op_t ec_gf8_mul_D9_ops[] = { + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 0, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_D9 = {8, + { + 1, + 2, + 6, + 7, + 4, + 5, + 0, + 3, + }, + ec_gf8_mul_D9_ops}; + +static ec_gf_op_t ec_gf8_mul_DA_ops[] = { + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR3, 8, 2, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 8, 0}, + {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 2, 4, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_DA = {9, + { + 2, + 5, + 7, + 1, + 0, + 4, + 3, + 6, + 8, + }, + ec_gf8_mul_DA_ops}; + +static ec_gf_op_t ec_gf8_mul_DB_ops[] = { + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 8, 4, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, 
{EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 8, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_DB = {9, + { + 7, + 5, + 6, + 2, + 3, + 4, + 1, + 0, + 8, + }, + ec_gf8_mul_DB_ops}; + +static ec_gf_op_t ec_gf8_mul_DC_ops[] = { + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_DC = {8, + { + 4, + 5, + 2, + 6, + 7, + 1, + 0, + 3, + }, + ec_gf8_mul_DC_ops}; + +static ec_gf_op_t ec_gf8_mul_DD_ops[] = { + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_DD = {8, + { + 1, + 2, + 3, + 6, + 7, + 0, + 4, + 5, + }, + ec_gf8_mul_DD_ops}; + +static ec_gf_op_t ec_gf8_mul_DE_ops[] = { + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_DE = {8, + { + 0, + 5, + 2, + 6, + 7, + 1, + 3, + 4, + }, + ec_gf8_mul_DE_ops}; + +static ec_gf_op_t ec_gf8_mul_DF_ops[] = { + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 8, 3, 0}, + 
{EC_GF_OP_COPY, 9, 0, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 8, 7, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 5, 8, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR3, 1, 9, 2}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_DF = {10, + { + 7, + 2, + 8, + 4, + 3, + 1, + 0, + 6, + 5, + 9, + }, + ec_gf8_mul_DF_ops}; + +static ec_gf_op_t ec_gf8_mul_E0_ops[] = { + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 7, 1, 0}, + {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_E0 = {8, + { + 2, + 3, + 4, + 7, + 5, + 6, + 0, + 1, + }, + ec_gf8_mul_E0_ops}; + +static ec_gf_op_t ec_gf8_mul_E1_ops[] = { + {EC_GF_OP_COPY, 8, 1, 0}, {EC_GF_OP_XOR2, 8, 7, 0}, + {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR3, 9, 5, 3}, + {EC_GF_OP_XOR2, 0, 9, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 4, 9, 0}, {EC_GF_OP_XOR2, 0, 2, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 2, 8, 0}, + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_E1 = {10, + { + 0, + 7, + 1, + 3, + 4, + 5, + 6, + 2, + 8, + 9, + }, + ec_gf8_mul_E1_ops}; + +static ec_gf_op_t ec_gf8_mul_E2_ops[] = { + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_END, 0, 
0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_E2 = {8, + { + 2, + 3, + 7, + 1, + 5, + 6, + 0, + 4, + }, + ec_gf8_mul_E2_ops}; + +static ec_gf_op_t ec_gf8_mul_E3_ops[] = { + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 3, 1, 0}, + {EC_GF_OP_XOR3, 8, 2, 7}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 0, 1, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR3, 6, 8, 4}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_E3 = {9, + { + 5, + 4, + 7, + 2, + 1, + 3, + 6, + 0, + 8, + }, + ec_gf8_mul_E3_ops}; + +static ec_gf_op_t ec_gf8_mul_E4_ops[] = { + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 5, 0}, + {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_E4 = {8, + { + 7, + 0, + 1, + 6, + 3, + 4, + 2, + 5, + }, + ec_gf8_mul_E4_ops}; + +static ec_gf_op_t ec_gf8_mul_E5_ops[] = { + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_E5 = {9, + { + 4, + 5, + 3, + 6, + 7, + 1, + 0, + 2, + 8, + }, + ec_gf8_mul_E5_ops}; + +static ec_gf_op_t ec_gf8_mul_E6_ops[] = { + {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 7, 5, 
0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_E6 = {8, + { + 5, + 4, + 3, + 6, + 7, + 0, + 1, + 2, + }, + ec_gf8_mul_E6_ops}; + +static ec_gf_op_t ec_gf8_mul_E7_ops[] = { + {EC_GF_OP_COPY, 8, 6, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR3, 9, 0, 6}, {EC_GF_OP_XOR2, 4, 9, 0}, + {EC_GF_OP_XOR2, 5, 9, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_E7 = {10, + { + 1, + 4, + 3, + 6, + 7, + 5, + 2, + 0, + 8, + 9, + }, + ec_gf8_mul_E7_ops}; + +static ec_gf_op_t ec_gf8_mul_E8_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 2, 5, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 1, 4, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_E8 = {8, + { + 1, + 4, + 2, + 7, + 3, + 0, + 5, + 6, + }, + ec_gf8_mul_E8_ops}; + +static ec_gf_op_t ec_gf8_mul_E9_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_COPY, 8, 1, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 6, 3, 0}, + {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR3, 1, 8, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_E9 = {9, + { + 6, + 
2, + 0, + 3, + 4, + 1, + 5, + 7, + 8, + }, + ec_gf8_mul_E9_ops}; + +static ec_gf_op_t ec_gf8_mul_EA_ops[] = { + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_EA = {8, + { + 3, + 4, + 5, + 6, + 7, + 0, + 1, + 2, + }, + ec_gf8_mul_EA_ops}; + +static ec_gf_op_t ec_gf8_mul_EB_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_EB = {8, + { + 3, + 4, + 5, + 6, + 7, + 0, + 1, + 2, + }, + ec_gf8_mul_EB_ops}; + +static ec_gf_op_t ec_gf8_mul_EC_ops[] = { + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR3, 8, 4, 0}, {EC_GF_OP_XOR2, 1, 8, 0}, + {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_EC = {9, + { + 7, + 4, + 3, + 0, + 2, + 5, + 1, + 6, + 8, + }, + ec_gf8_mul_EC_ops}; + +static ec_gf_op_t ec_gf8_mul_ED_ops[] = { + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 0, 0}, + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + 
+static ec_gf_mul_t ec_gf8_mul_ED = {8, + { + 5, + 6, + 7, + 0, + 1, + 4, + 3, + 2, + }, + ec_gf8_mul_ED_ops}; + +static ec_gf_op_t ec_gf8_mul_EE_ops[] = { + {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR3, 8, 2, 3}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 4, 8, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 8, 5, 0}, + {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 7, 8, 0}, + {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_EE = {9, + { + 6, + 4, + 5, + 7, + 2, + 3, + 0, + 1, + 8, + }, + ec_gf8_mul_EE_ops}; + +static ec_gf_op_t ec_gf8_mul_EF_ops[] = { + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_COPY, 8, 0, 0}, + {EC_GF_OP_XOR2, 8, 2, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 7, 8, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 6, 8, 0}, + {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_EF = {9, + { + 6, + 4, + 5, + 7, + 2, + 0, + 3, + 1, + 8, + }, + ec_gf8_mul_EF_ops}; + +static ec_gf_op_t ec_gf8_mul_F0_ops[] = { + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR3, 8, 3, 6}, + {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 8, 4, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 7, 8, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 1, 8, 0}, + {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_F0 = {9, + { + 3, + 4, + 6, + 1, + 2, + 0, + 5, + 7, + 8, + }, + ec_gf8_mul_F0_ops}; + +static ec_gf_op_t ec_gf8_mul_F1_ops[] = { + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_COPY, 8, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 2, 3, 0}, 
{EC_GF_OP_COPY, 9, 2, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 9, 0, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 5, 2, 0}, + {EC_GF_OP_XOR2, 7, 9, 0}, {EC_GF_OP_XOR2, 4, 9, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR3, 9, 8, 7}, + {EC_GF_OP_XOR2, 1, 9, 0}, {EC_GF_OP_XOR2, 5, 9, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_F1 = {10, + { + 7, + 2, + 6, + 3, + 5, + 1, + 4, + 0, + 8, + 9, + }, + ec_gf8_mul_F1_ops}; + +static ec_gf_op_t ec_gf8_mul_F2_ops[] = { + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 7, 2, 0}, + {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 2, 3, 0}, + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR3, 8, 6, 4}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR2, 5, 8, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_F2 = {9, + { + 1, + 0, + 6, + 7, + 4, + 5, + 2, + 3, + 8, + }, + ec_gf8_mul_F2_ops}; + +static ec_gf_op_t ec_gf8_mul_F3_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_F3 = {8, + { + 5, + 6, + 7, + 0, + 1, + 2, + 3, + 4, + }, + ec_gf8_mul_F3_ops}; + +static ec_gf_op_t ec_gf8_mul_F4_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 3, 7, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_F4 = {8, + { + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + }, + ec_gf8_mul_F4_ops}; + +static 
ec_gf_op_t ec_gf8_mul_F5_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_F5 = {8, + { + 7, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + }, + ec_gf8_mul_F5_ops}; + +static ec_gf_op_t ec_gf8_mul_F6_ops[] = { + {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_COPY, 8, 3, 0}, + {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_COPY, 9, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + {EC_GF_OP_XOR2, 9, 4, 0}, {EC_GF_OP_XOR2, 4, 1, 0}, + {EC_GF_OP_XOR2, 6, 9, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 7, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR3, 7, 8, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_F6 = {10, + { + 0, + 6, + 2, + 7, + 4, + 3, + 5, + 9, + 1, + 8, + }, + ec_gf8_mul_F6_ops}; + +static ec_gf_op_t ec_gf8_mul_F7_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0}, + {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_F7 = {8, + { + 6, + 7, + 0, + 1, + 2, + 3, + 4, + 5, + }, + ec_gf8_mul_F7_ops}; + +static ec_gf_op_t ec_gf8_mul_F8_ops[] = { + {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 3, 5, 0}, + {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 1, 6, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 5, 1, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 6, 7, 0}, + {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_F8 = {8, + { + 6, + 2, + 0, + 1, + 4, + 
5, + 3, + 7, + }, + ec_gf8_mul_F8_ops}; + +static ec_gf_op_t ec_gf8_mul_F9_ops[] = { + {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 0, 5, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 4, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR3, 8, 7, 1}, {EC_GF_OP_XOR2, 1, 3, 0}, + {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 5, 8, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_F9 = {9, + { + 4, + 1, + 7, + 6, + 0, + 3, + 5, + 2, + 8, + }, + ec_gf8_mul_F9_ops}; + +static ec_gf_op_t ec_gf8_mul_FA_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0}, + {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 1, 5, 0}, + {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0}, + {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 3, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_FA = {8, + { + 0, + 1, + 2, + 4, + 5, + 6, + 7, + 3, + }, + ec_gf8_mul_FA_ops}; + +static ec_gf_op_t ec_gf8_mul_FB_ops[] = { + {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 3, 2, 0}, + {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 2, 7, 0}, + {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 7, 6, 0}, + {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 6, 5, 0}, + {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 5, 4, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_FB = {8, + { + 4, + 5, + 6, + 7, + 0, + 1, + 2, + 3, + }, + ec_gf8_mul_FB_ops}; + +static ec_gf_op_t ec_gf8_mul_FC_ops[] = { + {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 7, 4, 0}, + {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_COPY, 9, 3, 0}, + {EC_GF_OP_XOR3, 8, 5, 7}, {EC_GF_OP_XOR2, 3, 6, 0}, + {EC_GF_OP_XOR2, 8, 3, 0}, {EC_GF_OP_XOR2, 2, 8, 0}, + {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 4, 2, 0}, + 
{EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 3, 4, 0}, + {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 6, 0, 0}, + {EC_GF_OP_XOR3, 0, 9, 2}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_FC = {10, + { + 5, + 6, + 3, + 7, + 1, + 8, + 0, + 4, + 2, + 9, + }, + ec_gf8_mul_FC_ops}; + +static ec_gf_op_t ec_gf8_mul_FD_ops[] = { + {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_COPY, 8, 7, 0}, + {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 7, 5, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 4, 7, 0}, + {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 0, 4, 0}, + {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 1, 2, 0}, + {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 1, 0}, + {EC_GF_OP_XOR3, 1, 8, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_FD = {9, + { + 5, + 3, + 7, + 6, + 1, + 2, + 4, + 0, + 8, + }, + ec_gf8_mul_FD_ops}; + +static ec_gf_op_t ec_gf8_mul_FE_ops[] = { + {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_COPY, 8, 2, 0}, + {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 6, 2, 0}, + {EC_GF_OP_XOR2, 8, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0}, + {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 0, 6, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR2, 3, 0, 0}, + {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}}; + +static ec_gf_mul_t ec_gf8_mul_FE = {9, + { + 3, + 4, + 8, + 2, + 5, + 0, + 6, + 1, + 7, + }, + ec_gf8_mul_FE_ops}; + +static ec_gf_op_t ec_gf8_mul_FF_ops[] = { + {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_COPY, 9, 0, 0}, + {EC_GF_OP_COPY, 8, 4, 0}, {EC_GF_OP_XOR2, 9, 1, 0}, + {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 9, 4, 0}, + {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 2, 0, 0}, + {EC_GF_OP_XOR2, 3, 9, 0}, {EC_GF_OP_XOR2, 7, 3, 0}, + {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 5, 3, 0}, + {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 1, 7, 0}, + {EC_GF_OP_XOR3, 3, 8, 5}, {EC_GF_OP_XOR2, 4, 6, 0}, + {EC_GF_OP_END, 0, 0, 0}}; + +static 
ec_gf_mul_t ec_gf8_mul_FF = {10, + { + 6, + 5, + 0, + 1, + 2, + 4, + 9, + 3, + 7, + 8, + }, + ec_gf8_mul_FF_ops}; + +ec_gf_mul_t *ec_gf8_mul[] = { + &ec_gf8_mul_00, &ec_gf8_mul_01, &ec_gf8_mul_02, &ec_gf8_mul_03, + &ec_gf8_mul_04, &ec_gf8_mul_05, &ec_gf8_mul_06, &ec_gf8_mul_07, + &ec_gf8_mul_08, &ec_gf8_mul_09, &ec_gf8_mul_0A, &ec_gf8_mul_0B, + &ec_gf8_mul_0C, &ec_gf8_mul_0D, &ec_gf8_mul_0E, &ec_gf8_mul_0F, + &ec_gf8_mul_10, &ec_gf8_mul_11, &ec_gf8_mul_12, &ec_gf8_mul_13, + &ec_gf8_mul_14, &ec_gf8_mul_15, &ec_gf8_mul_16, &ec_gf8_mul_17, + &ec_gf8_mul_18, &ec_gf8_mul_19, &ec_gf8_mul_1A, &ec_gf8_mul_1B, + &ec_gf8_mul_1C, &ec_gf8_mul_1D, &ec_gf8_mul_1E, &ec_gf8_mul_1F, + &ec_gf8_mul_20, &ec_gf8_mul_21, &ec_gf8_mul_22, &ec_gf8_mul_23, + &ec_gf8_mul_24, &ec_gf8_mul_25, &ec_gf8_mul_26, &ec_gf8_mul_27, + &ec_gf8_mul_28, &ec_gf8_mul_29, &ec_gf8_mul_2A, &ec_gf8_mul_2B, + &ec_gf8_mul_2C, &ec_gf8_mul_2D, &ec_gf8_mul_2E, &ec_gf8_mul_2F, + &ec_gf8_mul_30, &ec_gf8_mul_31, &ec_gf8_mul_32, &ec_gf8_mul_33, + &ec_gf8_mul_34, &ec_gf8_mul_35, &ec_gf8_mul_36, &ec_gf8_mul_37, + &ec_gf8_mul_38, &ec_gf8_mul_39, &ec_gf8_mul_3A, &ec_gf8_mul_3B, + &ec_gf8_mul_3C, &ec_gf8_mul_3D, &ec_gf8_mul_3E, &ec_gf8_mul_3F, + &ec_gf8_mul_40, &ec_gf8_mul_41, &ec_gf8_mul_42, &ec_gf8_mul_43, + &ec_gf8_mul_44, &ec_gf8_mul_45, &ec_gf8_mul_46, &ec_gf8_mul_47, + &ec_gf8_mul_48, &ec_gf8_mul_49, &ec_gf8_mul_4A, &ec_gf8_mul_4B, + &ec_gf8_mul_4C, &ec_gf8_mul_4D, &ec_gf8_mul_4E, &ec_gf8_mul_4F, + &ec_gf8_mul_50, &ec_gf8_mul_51, &ec_gf8_mul_52, &ec_gf8_mul_53, + &ec_gf8_mul_54, &ec_gf8_mul_55, &ec_gf8_mul_56, &ec_gf8_mul_57, + &ec_gf8_mul_58, &ec_gf8_mul_59, &ec_gf8_mul_5A, &ec_gf8_mul_5B, + &ec_gf8_mul_5C, &ec_gf8_mul_5D, &ec_gf8_mul_5E, &ec_gf8_mul_5F, + &ec_gf8_mul_60, &ec_gf8_mul_61, &ec_gf8_mul_62, &ec_gf8_mul_63, + &ec_gf8_mul_64, &ec_gf8_mul_65, &ec_gf8_mul_66, &ec_gf8_mul_67, + &ec_gf8_mul_68, &ec_gf8_mul_69, &ec_gf8_mul_6A, &ec_gf8_mul_6B, + &ec_gf8_mul_6C, &ec_gf8_mul_6D, &ec_gf8_mul_6E, &ec_gf8_mul_6F, + 
&ec_gf8_mul_70, &ec_gf8_mul_71, &ec_gf8_mul_72, &ec_gf8_mul_73, + &ec_gf8_mul_74, &ec_gf8_mul_75, &ec_gf8_mul_76, &ec_gf8_mul_77, + &ec_gf8_mul_78, &ec_gf8_mul_79, &ec_gf8_mul_7A, &ec_gf8_mul_7B, + &ec_gf8_mul_7C, &ec_gf8_mul_7D, &ec_gf8_mul_7E, &ec_gf8_mul_7F, + &ec_gf8_mul_80, &ec_gf8_mul_81, &ec_gf8_mul_82, &ec_gf8_mul_83, + &ec_gf8_mul_84, &ec_gf8_mul_85, &ec_gf8_mul_86, &ec_gf8_mul_87, + &ec_gf8_mul_88, &ec_gf8_mul_89, &ec_gf8_mul_8A, &ec_gf8_mul_8B, + &ec_gf8_mul_8C, &ec_gf8_mul_8D, &ec_gf8_mul_8E, &ec_gf8_mul_8F, + &ec_gf8_mul_90, &ec_gf8_mul_91, &ec_gf8_mul_92, &ec_gf8_mul_93, + &ec_gf8_mul_94, &ec_gf8_mul_95, &ec_gf8_mul_96, &ec_gf8_mul_97, + &ec_gf8_mul_98, &ec_gf8_mul_99, &ec_gf8_mul_9A, &ec_gf8_mul_9B, + &ec_gf8_mul_9C, &ec_gf8_mul_9D, &ec_gf8_mul_9E, &ec_gf8_mul_9F, + &ec_gf8_mul_A0, &ec_gf8_mul_A1, &ec_gf8_mul_A2, &ec_gf8_mul_A3, + &ec_gf8_mul_A4, &ec_gf8_mul_A5, &ec_gf8_mul_A6, &ec_gf8_mul_A7, + &ec_gf8_mul_A8, &ec_gf8_mul_A9, &ec_gf8_mul_AA, &ec_gf8_mul_AB, + &ec_gf8_mul_AC, &ec_gf8_mul_AD, &ec_gf8_mul_AE, &ec_gf8_mul_AF, + &ec_gf8_mul_B0, &ec_gf8_mul_B1, &ec_gf8_mul_B2, &ec_gf8_mul_B3, + &ec_gf8_mul_B4, &ec_gf8_mul_B5, &ec_gf8_mul_B6, &ec_gf8_mul_B7, + &ec_gf8_mul_B8, &ec_gf8_mul_B9, &ec_gf8_mul_BA, &ec_gf8_mul_BB, + &ec_gf8_mul_BC, &ec_gf8_mul_BD, &ec_gf8_mul_BE, &ec_gf8_mul_BF, + &ec_gf8_mul_C0, &ec_gf8_mul_C1, &ec_gf8_mul_C2, &ec_gf8_mul_C3, + &ec_gf8_mul_C4, &ec_gf8_mul_C5, &ec_gf8_mul_C6, &ec_gf8_mul_C7, + &ec_gf8_mul_C8, &ec_gf8_mul_C9, &ec_gf8_mul_CA, &ec_gf8_mul_CB, + &ec_gf8_mul_CC, &ec_gf8_mul_CD, &ec_gf8_mul_CE, &ec_gf8_mul_CF, + &ec_gf8_mul_D0, &ec_gf8_mul_D1, &ec_gf8_mul_D2, &ec_gf8_mul_D3, + &ec_gf8_mul_D4, &ec_gf8_mul_D5, &ec_gf8_mul_D6, &ec_gf8_mul_D7, + &ec_gf8_mul_D8, &ec_gf8_mul_D9, &ec_gf8_mul_DA, &ec_gf8_mul_DB, + &ec_gf8_mul_DC, &ec_gf8_mul_DD, &ec_gf8_mul_DE, &ec_gf8_mul_DF, + &ec_gf8_mul_E0, &ec_gf8_mul_E1, &ec_gf8_mul_E2, &ec_gf8_mul_E3, + &ec_gf8_mul_E4, &ec_gf8_mul_E5, &ec_gf8_mul_E6, &ec_gf8_mul_E7, + &ec_gf8_mul_E8, 
&ec_gf8_mul_E9, &ec_gf8_mul_EA, &ec_gf8_mul_EB, + &ec_gf8_mul_EC, &ec_gf8_mul_ED, &ec_gf8_mul_EE, &ec_gf8_mul_EF, + &ec_gf8_mul_F0, &ec_gf8_mul_F1, &ec_gf8_mul_F2, &ec_gf8_mul_F3, + &ec_gf8_mul_F4, &ec_gf8_mul_F5, &ec_gf8_mul_F6, &ec_gf8_mul_F7, + &ec_gf8_mul_F8, &ec_gf8_mul_F9, &ec_gf8_mul_FA, &ec_gf8_mul_FB, + &ec_gf8_mul_FC, &ec_gf8_mul_FD, &ec_gf8_mul_FE, &ec_gf8_mul_FF}; diff --git a/xlators/cluster/ec/src/ec-gf8.h b/xlators/cluster/ec/src/ec-gf8.h new file mode 100644 index 00000000000..4aca91127fc --- /dev/null +++ b/xlators/cluster/ec/src/ec-gf8.h @@ -0,0 +1,18 @@ +/* + Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_GF8_H__ +#define __EC_GF8_H__ + +#include "ec-galois.h" + +extern ec_gf_mul_t *ec_gf8_mul[]; + +#endif /* __EC_GF8_H__ */ diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c new file mode 100644 index 00000000000..7d991f04aac --- /dev/null +++ b/xlators/cluster/ec/src/ec-heal.c @@ -0,0 +1,3367 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <glusterfs/defaults.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/syncop.h> +#include <glusterfs/syncop-utils.h> +#include <glusterfs/cluster-syncop.h> + +#include "ec.h" +#include "ec-types.h" +#include "ec-messages.h" +#include "ec-helpers.h" +#include "ec-common.h" +#include "ec-combine.h" +#include "ec-method.h" +#include "ec-fops.h" +#include "ec-heald.h" + +#define EC_COUNT(array, max) \ + ({ \ + int __i; \ + int __res = 0; \ + for (__i = 0; __i < max; __i++) \ + if (array[__i]) \ + __res++; \ + __res; \ + }) +#define EC_INTERSECT(dst, src1, src2, max) \ + ({ \ + int __i; \ + for (__i = 0; __i < max; __i++) \ + dst[__i] = src1[__i] && src2[__i]; \ + }) +#define EC_ADJUST_SOURCE(source, sources, max) \ + ({ \ + int __i; \ + if (sources[source] == 0) { \ + source = -1; \ + for (__i = 0; __i < max; __i++) \ + if (sources[__i]) \ + source = __i; \ + } \ + }) +#define IA_EQUAL(f, s, field) \ + (memcmp(&(f.ia_##field), &(s.ia_##field), sizeof(s.ia_##field)) == 0) +#define EC_REPLIES_ALLOC(replies, numsubvols) \ + do { \ + int __i = 0; \ + replies = alloca0(numsubvols * sizeof(*replies)); \ + for (__i = 0; __i < numsubvols; __i++) \ + INIT_LIST_HEAD(&replies[__i].entries.list); \ + } while (0) + +struct ec_name_data { + call_frame_t *frame; + unsigned char *participants; + unsigned char *failed_on; + unsigned char *gfidless; + unsigned char *enoent; + unsigned char *same; + char *name; + inode_t *parent; + default_args_cbk_t *replies; + uint32_t heal_pending; +}; + +static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL}; + +static gf_boolean_t +ec_ignorable_key_match(dict_t *dict, char *key, data_t *val, void *mdata) +{ + int i = 0; + + if (!key) + goto out; + + if (strncmp(key, EC_XATTR_PREFIX, SLEN(EC_XATTR_PREFIX)) == 0) + return _gf_true; + + for (i = 0; ec_ignore_xattrs[i]; i++) { + if (!strcmp(key, ec_ignore_xattrs[i])) + return _gf_true; + } + +out: + return _gf_false; 
+} + +static gf_boolean_t +ec_sh_key_match(dict_t *dict, char *key, data_t *val, void *mdata) +{ + return !ec_ignorable_key_match(dict, key, val, mdata); +} +/* FOP: heal */ + +void +ec_set_entry_healing(ec_fop_data_t *fop) +{ + ec_inode_t *ctx = NULL; + loc_t *loc = NULL; + + if (!fop) + return; + + loc = &fop->loc[0]; + LOCK(&loc->inode->lock); + { + ctx = __ec_inode_get(loc->inode, fop->xl); + if (ctx) { + ctx->heal_count += 1; + } + } + UNLOCK(&loc->inode->lock); +} + +void +ec_reset_entry_healing(ec_fop_data_t *fop) +{ + ec_inode_t *ctx = NULL; + loc_t *loc = NULL; + int32_t heal_count = 0; + if (!fop) + return; + + loc = &fop->loc[0]; + LOCK(&loc->inode->lock); + { + ctx = __ec_inode_get(loc->inode, fop->xl); + if (ctx) { + ctx->heal_count += -1; + heal_count = ctx->heal_count; + } + } + UNLOCK(&loc->inode->lock); + GF_ASSERT(heal_count >= 0); +} + +uintptr_t +ec_heal_check(ec_fop_data_t *fop, uintptr_t *pgood) +{ + ec_cbk_data_t *cbk; + uintptr_t mask[2] = {0, 0}; + + list_for_each_entry(cbk, &fop->cbk_list, list) + { + mask[cbk->op_ret >= 0] |= cbk->mask; + } + + if (pgood != NULL) { + *pgood = mask[1]; + } + + return mask[0]; +} + +void +ec_heal_update(ec_fop_data_t *fop, int32_t is_open) +{ + ec_heal_t *heal = fop->data; + uintptr_t good, bad; + + bad = ec_heal_check(fop, &good); + + LOCK(&heal->lock); + + heal->bad &= ~bad; + if (is_open) { + heal->open |= good; + } + + UNLOCK(&heal->lock); + + fop->error = 0; +} + +void +ec_heal_avoid(ec_fop_data_t *fop) +{ + ec_heal_t *heal = fop->data; + uintptr_t bad; + + bad = ec_heal_check(fop, NULL); + + LOCK(&heal->lock); + + heal->good &= ~bad; + + UNLOCK(&heal->lock); +} + +int32_t +ec_heal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + ec_fop_data_t *fop = cookie; + ec_heal_t *heal = fop->data; + + if (op_ret >= 0) { + GF_ASSERT( + ec_set_inode_size(heal->fop, heal->fd->inode, heal->total_size)); + } + + return 0; +} + +void 
+ec_heal_lock(ec_heal_t *heal, int32_t type, fd_t *fd, loc_t *loc, off_t offset, + size_t size) +{ + struct gf_flock flock; + fop_inodelk_cbk_t cbk = NULL; + + flock.l_type = type; + flock.l_whence = SEEK_SET; + flock.l_start = offset; + flock.l_len = size; + flock.l_pid = 0; + flock.l_owner.len = 0; + + if (type == F_UNLCK) { + /* Remove inode size information before unlocking it. */ + if (fd == NULL) { + ec_clear_inode_info(heal->fop, heal->loc.inode); + } else { + ec_clear_inode_info(heal->fop, heal->fd->inode); + } + cbk = ec_lock_unlocked; + } else { + /* Otherwise use the callback to update size information. */ + cbk = ec_heal_lock_cbk; + } + + if (fd != NULL) { + ec_finodelk(heal->fop->frame, heal->xl, + &heal->fop->frame->root->lk_owner, heal->fop->mask, + EC_MINIMUM_ALL, cbk, heal, heal->xl->name, fd, F_SETLKW, + &flock, NULL); + } else { + ec_inodelk(heal->fop->frame, heal->xl, + &heal->fop->frame->root->lk_owner, heal->fop->mask, + EC_MINIMUM_ALL, cbk, heal, heal->xl->name, loc, F_SETLKW, + &flock, NULL); + } +} + +void +ec_heal_inodelk(ec_heal_t *heal, int32_t type, int32_t use_fd, off_t offset, + size_t size) +{ + ec_heal_lock(heal, type, use_fd ? heal->fd : NULL, &heal->loc, offset, + size); +} + +int32_t +ec_heal_xattr_clean(dict_t *dict, char *key, data_t *data, void *arg) +{ + dict_t *base = arg; + + if (ec_ignorable_key_match(NULL, key, NULL, NULL)) { + dict_del(dict, key); + return 0; + } + + if (dict_get(base, key) != NULL) + dict_del(dict, key); + + return 0; +} + +/******************************************************************** + * ec_wind_xattrop_parallel: + * Helper function to update the extended attributes + * in parallel. 
+ * + *******************************************************************/ +void +ec_wind_xattrop_parallel(call_frame_t *frame, xlator_t *subvol, int child_index, + loc_t *loc, gf_xattrop_flags_t flags, dict_t **dict, + dict_t *xdata) +{ + gf_msg_debug("EC", 0, "WIND: on child %d ", child_index); + STACK_WIND_COOKIE( + frame, cluster_xattrop_cbk, (void *)(uintptr_t)child_index, subvol, + subvol->fops->xattrop, loc, flags, dict[child_index], xdata); +} + +int32_t +ec_heal_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + ec_fop_data_t *fop = cookie; + ec_heal_t *heal = fop->data; + + ec_trace("WRITE_CBK", cookie, "ret=%d, errno=%d", op_ret, op_errno); + + gf_msg_debug(fop->xl->name, 0, + "%s: write op_ret %d, op_errno %s" + " at %" PRIu64, + uuid_utoa(heal->fd->inode->gfid), op_ret, strerror(op_errno), + heal->offset); + + ec_heal_update(cookie, 0); + + return 0; +} + +int32_t +ec_heal_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) +{ + ec_fop_data_t *fop = cookie; + ec_heal_t *heal = fop->data; + + ec_trace("READ_CBK", fop, "ret=%d, errno=%d", op_ret, op_errno); + + ec_heal_avoid(fop); + + if (op_ret > 0) { + gf_msg_debug(fop->xl->name, 0, + "%s: read succeeded, proceeding " + "to write at %" PRIu64, + uuid_utoa(heal->fd->inode->gfid), heal->offset); + ec_writev(heal->fop->frame, heal->xl, heal->bad, EC_MINIMUM_ONE, + ec_heal_writev_cbk, heal, heal->fd, vector, count, + heal->offset, 0, iobref, NULL); + } else { + if (op_ret < 0) { + gf_msg_debug(fop->xl->name, 0, + "%s: read failed %s, failing " + "to heal block at %" PRIu64, + uuid_utoa(heal->fd->inode->gfid), strerror(op_errno), + heal->offset); + heal->bad = 0; + } + heal->done = 1; + } + + return 0; +} + +void +ec_heal_data_block(ec_heal_t *heal) +{ + 
ec_trace("DATA", heal->fop, "good=%lX, bad=%lX", heal->good, heal->bad); + + if ((heal->good != 0) && (heal->bad != 0) && + (heal->iatt.ia_type == IA_IFREG)) { + ec_readv(heal->fop->frame, heal->xl, heal->good, EC_MINIMUM_MIN, + ec_heal_readv_cbk, heal, heal->fd, heal->size, heal->offset, 0, + NULL); + } +} + +/* FOP: fheal */ + +void +ec_fheal(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fheal_cbk_t func, void *data, fd_t *fd, + int32_t partial, dict_t *xdata) +{ + ec_fd_t *ctx = ec_fd_get(fd, this); + + if (ctx != NULL) { + gf_msg_trace("ec", 0, "FHEAL ctx: flags=%X, open=%" PRIXPTR, ctx->flags, + ctx->open); + ec_heal(frame, this, target, fop_flags, func, data, &ctx->loc, partial, + xdata); + } +} + +/* Common heal code */ +void +ec_mask_to_char_array(uintptr_t mask, unsigned char *array, int numsubvols) +{ + int i = 0; + + for (i = 0; i < numsubvols; i++) + array[i] = ((mask >> i) & 1); +} + +uintptr_t +ec_char_array_to_mask(unsigned char *array, int numsubvols) +{ + int i = 0; + uintptr_t mask = 0; + + if (array == NULL) + goto out; + + for (i = 0; i < numsubvols; i++) + if (array[i]) + mask |= (1ULL << i); +out: + return mask; +} + +int +ec_heal_entry_find_direction(ec_t *ec, default_args_cbk_t *replies, + uint64_t *versions, uint64_t *dirty, + unsigned char *sources, + unsigned char *healed_sinks) +{ + uint64_t xattr[EC_VERSION_SIZE] = {0}; + int source = -1; + uint64_t max_version = 0; + int ret = 0; + int i = 0; + + for (i = 0; i < ec->nodes; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret == -1) + continue; + + if (source == -1) + source = i; + + ret = ec_dict_get_array(replies[i].xdata, EC_XATTR_VERSION, xattr, + EC_VERSION_SIZE); + if (ret == 0) { + versions[i] = xattr[EC_DATA_TXN]; + if (max_version < versions[i]) { + max_version = versions[i]; + source = i; + } + } + + memset(xattr, 0, sizeof(xattr)); + ret = ec_dict_get_array(replies[i].xdata, EC_XATTR_DIRTY, xattr, + EC_VERSION_SIZE); + if 
(ret == 0) { + dirty[i] = xattr[EC_DATA_TXN]; + } + } + + if (source < 0) + goto out; + + for (i = 0; i < ec->nodes; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret == -1) + continue; + + if (versions[i] == versions[source]) + sources[i] = 1; + else + healed_sinks[i] = 1; + } + +out: + return source; +} + +int +ec_adjust_versions(call_frame_t *frame, ec_t *ec, ec_txn_t type, inode_t *inode, + int source, unsigned char *sources, + unsigned char *healed_sinks, uint64_t *versions, + uint64_t *dirty) +{ + int i = 0; + int ret = 0; + int call_count = 0; + dict_t **xattr = NULL; + int op_ret = 0; + loc_t loc = {0}; + gf_boolean_t erase_dirty = _gf_false; + uint64_t *versions_xattr = NULL; + uint64_t *dirty_xattr = NULL; + uint64_t allzero[2] = {0}; + unsigned char *on = NULL; + unsigned char *output = NULL; + default_args_cbk_t *replies = NULL; + + /* Allocate the required memory */ + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + on = alloca0(ec->nodes); + output = alloca0(ec->nodes); + EC_REPLIES_ALLOC(replies, ec->nodes); + xattr = GF_CALLOC(ec->nodes, sizeof(*xattr), gf_common_mt_pointer); + if (!xattr) { + op_ret = -ENOMEM; + goto out; + } + for (i = 0; i < ec->nodes; i++) { + xattr[i] = dict_new(); + if (!xattr[i]) { + op_ret = -ENOMEM; + goto out; + } + } + + /* dirty xattr represents if the file/dir needs heal. 
Unless all the + * copies are healed, don't erase it */ + if (EC_COUNT(sources, ec->nodes) + EC_COUNT(healed_sinks, ec->nodes) == + ec->nodes) + erase_dirty = _gf_true; + else + op_ret = -ENOTCONN; + + /* Populate the xattr array */ + for (i = 0; i < ec->nodes; i++) { + if (!sources[i] && !healed_sinks[i]) + continue; + versions_xattr = GF_CALLOC(EC_VERSION_SIZE, sizeof(*versions_xattr), + gf_common_mt_pointer); + if (!versions_xattr) { + op_ret = -ENOMEM; + continue; + } + + versions_xattr[type] = hton64(versions[source] - versions[i]); + ret = dict_set_bin(xattr[i], EC_XATTR_VERSION, versions_xattr, + (sizeof(*versions_xattr) * EC_VERSION_SIZE)); + if (ret < 0) { + op_ret = -ENOMEM; + continue; + } + + if (erase_dirty) { + dirty_xattr = GF_CALLOC(EC_VERSION_SIZE, sizeof(*dirty_xattr), + gf_common_mt_pointer); + if (!dirty_xattr) { + op_ret = -ENOMEM; + continue; + } + + dirty_xattr[type] = hton64(-dirty[i]); + ret = dict_set_bin(xattr[i], EC_XATTR_DIRTY, dirty_xattr, + (sizeof(*dirty_xattr) * EC_VERSION_SIZE)); + if (ret < 0) { + op_ret = -ENOMEM; + continue; + } + } + + if (memcmp(versions_xattr, allzero, + (sizeof(*versions_xattr) * EC_VERSION_SIZE)) == 0) { + if (!erase_dirty) { + continue; + } + + if (memcmp(dirty_xattr, allzero, + (sizeof(*dirty_xattr) * EC_VERSION_SIZE)) == 0) { + continue; + } + } + + on[i] = 1; + call_count++; + } + + /* Update the bricks with xattr */ + if (call_count) { + PARALLEL_FOP_ONLIST(ec->xl_list, on, ec->nodes, replies, frame, + ec_wind_xattrop_parallel, &loc, + GF_XATTROP_ADD_ARRAY64, xattr, NULL); + ret = cluster_fop_success_fill(replies, ec->nodes, output); + } + + if (ret < call_count) { + op_ret = -ENOTCONN; + goto out; + } + +out: + /* Cleanup */ + if (xattr) { + for (i = 0; i < ec->nodes; i++) { + if (xattr[i]) + dict_unref(xattr[i]); + } + GF_FREE(xattr); + } + cluster_replies_wipe(replies, ec->nodes); + loc_wipe(&loc); + return op_ret; +} + +int +ec_heal_metadata_find_direction(ec_t *ec, default_args_cbk_t *replies, + 
uint64_t *versions, uint64_t *dirty, + unsigned char *sources, + unsigned char *healed_sinks) +{ + uint64_t xattr[EC_VERSION_SIZE] = {0}; + uint64_t max_version = 0; + int same_count = 0; + int max_same_count = 0; + int same_source = -1; + int ret = 0; + int i = 0; + int j = 0; + int *groups = NULL; + struct iatt source_ia = {0}; + struct iatt child_ia = {0}; + + groups = alloca0(ec->nodes * sizeof(*groups)); + for (i = 0; i < ec->nodes; i++) + groups[i] = -1; + + for (i = 0; i < ec->nodes; i++) { + if (!replies[i].valid) + continue; + if (replies[i].op_ret < 0) + continue; + ret = ec_dict_get_array(replies[i].xdata, EC_XATTR_VERSION, xattr, + EC_VERSION_SIZE); + if (ret == 0) { + versions[i] = xattr[EC_METADATA_TXN]; + } + + memset(xattr, 0, sizeof(xattr)); + ret = ec_dict_get_array(replies[i].xdata, EC_XATTR_DIRTY, xattr, + EC_VERSION_SIZE); + if (ret == 0) { + dirty[i] = xattr[EC_METADATA_TXN]; + } + if (groups[i] >= 0) /*Already part of group*/ + continue; + groups[i] = i; + same_count = 1; + source_ia = replies[i].stat; + for (j = i + 1; j < ec->nodes; j++) { + if (!replies[j].valid || replies[j].op_ret < 0) + continue; + child_ia = replies[j].stat; + if (!IA_EQUAL(source_ia, child_ia, gfid) || + !IA_EQUAL(source_ia, child_ia, type) || + !IA_EQUAL(source_ia, child_ia, prot) || + !IA_EQUAL(source_ia, child_ia, uid) || + !IA_EQUAL(source_ia, child_ia, gid)) + continue; + if (!are_dicts_equal(replies[i].xdata, replies[j].xdata, + ec_sh_key_match, NULL)) + continue; + groups[j] = i; + same_count++; + } + + if (max_same_count < same_count) { + max_same_count = same_count; + same_source = i; + } + } + + if (max_same_count < ec->fragments) { + ret = -EIO; + goto out; + } + + for (i = 0; i < ec->nodes; i++) { + if (groups[i] == groups[same_source]) + sources[i] = 1; + else if (replies[i].valid && replies[i].op_ret >= 0) + healed_sinks[i] = 1; + } + for (i = 0; i < ec->nodes; i++) { + if (sources[i] && (versions[i] > max_version)) { + same_source = i; + max_version = 
versions[i]; + } + } + ret = same_source; +out: + return ret; +} + +int +__ec_heal_metadata_prepare(call_frame_t *frame, ec_t *ec, inode_t *inode, + unsigned char *locked_on, + default_args_cbk_t *replies, uint64_t *versions, + uint64_t *dirty, unsigned char *sources, + unsigned char *healed_sinks) +{ + loc_t loc = {0}; + unsigned char *output = NULL; + unsigned char *lookup_on = NULL; + int ret = 0; + int source = 0; + default_args_cbk_t *greplies = NULL; + int i = 0; + EC_REPLIES_ALLOC(greplies, ec->nodes); + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + output = alloca0(ec->nodes); + lookup_on = alloca0(ec->nodes); + ret = cluster_lookup(ec->xl_list, locked_on, ec->nodes, replies, output, + frame, ec->xl, &loc, NULL); + if (ret <= ec->fragments) { + ret = -ENOTCONN; + goto out; + } + + memcpy(lookup_on, output, ec->nodes); + /*Use getxattr to get the filtered xattrs which filter internal xattrs*/ + ret = cluster_getxattr(ec->xl_list, lookup_on, ec->nodes, greplies, output, + frame, ec->xl, &loc, NULL, NULL); + for (i = 0; i < ec->nodes; i++) { + if (lookup_on[i] && !output[i]) { + replies[i].valid = 0; + continue; + } + if (replies[i].xdata) { + dict_unref(replies[i].xdata); + replies[i].xdata = NULL; + if (greplies[i].xattr) + replies[i].xdata = dict_ref(greplies[i].xattr); + } + } + + source = ec_heal_metadata_find_direction(ec, replies, versions, dirty, + sources, healed_sinks); + if (source < 0) { + ret = -EIO; + goto out; + } + ret = source; +out: + cluster_replies_wipe(greplies, ec->nodes); + loc_wipe(&loc); + return ret; +} + +/* Metadata heal */ +int +__ec_removexattr_sinks(call_frame_t *frame, ec_t *ec, inode_t *inode, + int source, unsigned char *sources, + unsigned char *healed_sinks, default_args_cbk_t *replies) +{ + int i = 0; + int ret = 0; + loc_t loc = {0}; + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + for (i = 0; i < ec->nodes; i++) { + if (i == source) + continue; + if (!sources[i] && 
!healed_sinks[i]) + continue; + ret = dict_foreach(replies[i].xdata, ec_heal_xattr_clean, + replies[source].xdata); + if (ret < 0) { + sources[i] = 0; + healed_sinks[i] = 0; + continue; + } + + if (replies[i].xdata->count == 0) { + continue; + } else if (sources[i]) { + /* This can happen if setxattr/removexattr succeeds on + * the bricks but fails to update the version. This + * will make sure that the xattrs are made equal after + * heal*/ + sources[i] = 0; + healed_sinks[i] = 1; + } + + ret = syncop_removexattr(ec->xl_list[i], &loc, "", replies[i].xdata, + NULL); + if (ret < 0) + healed_sinks[i] = 0; + } + + loc_wipe(&loc); + if (EC_COUNT(healed_sinks, ec->nodes) == 0) + return -ENOTCONN; + return 0; +} + +int +__ec_heal_metadata(call_frame_t *frame, ec_t *ec, inode_t *inode, + unsigned char *locked_on, unsigned char *sources, + unsigned char *healed_sinks) +{ + loc_t loc = {0}; + int ret = 0; + int source = 0; + default_args_cbk_t *replies = NULL; + default_args_cbk_t *sreplies = NULL; + uint64_t *versions = NULL; + uint64_t *dirty = NULL; + unsigned char *output = NULL; + dict_t *source_dict = NULL; + struct iatt source_buf = {0}; + + EC_REPLIES_ALLOC(replies, ec->nodes); + EC_REPLIES_ALLOC(sreplies, ec->nodes); + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + output = alloca0(ec->nodes); + versions = alloca0(ec->nodes * sizeof(*versions)); + dirty = alloca0(ec->nodes * sizeof(*dirty)); + source = __ec_heal_metadata_prepare(frame, ec, inode, locked_on, replies, + versions, dirty, sources, healed_sinks); + if (source < 0) { + ret = -EIO; + goto out; + } + + if ((EC_COUNT(sources, ec->nodes) == ec->nodes) || + (EC_COUNT(healed_sinks, ec->nodes) == 0)) { + ret = 0; + goto erase_dirty; + } + + source_buf = replies[source].stat; + ret = cluster_setattr(ec->xl_list, healed_sinks, ec->nodes, sreplies, + output, frame, ec->xl, &loc, &source_buf, + GF_SET_ATTR_MODE | GF_SET_ATTR_UID | GF_SET_ATTR_GID, + NULL); + /*In case the operation fails 
on some of the subvols*/ + memcpy(healed_sinks, output, ec->nodes); + if (EC_COUNT(healed_sinks, ec->nodes) == 0) { + ret = -ENOTCONN; + goto out; + } + + ret = __ec_removexattr_sinks(frame, ec, inode, source, sources, + healed_sinks, replies); + if (ret < 0) + goto out; + + source_dict = dict_ref(replies[source].xdata); + if (dict_foreach_match(source_dict, ec_ignorable_key_match, NULL, + dict_remove_foreach_fn, NULL) == -1) { + ret = -ENOMEM; + goto out; + } + + ret = cluster_setxattr(ec->xl_list, healed_sinks, ec->nodes, replies, + output, frame, ec->xl, &loc, source_dict, 0, NULL); + + EC_INTERSECT(healed_sinks, healed_sinks, output, ec->nodes); + if (EC_COUNT(healed_sinks, ec->nodes) == 0) { + ret = -ENOTCONN; + goto out; + } + +erase_dirty: + ret = ec_adjust_versions(frame, ec, EC_METADATA_TXN, inode, source, sources, + healed_sinks, versions, dirty); +out: + if (source_dict) + dict_unref(source_dict); + + loc_wipe(&loc); + cluster_replies_wipe(replies, ec->nodes); + cluster_replies_wipe(sreplies, ec->nodes); + return ret; +} + +int +ec_heal_metadata(call_frame_t *frame, ec_t *ec, inode_t *inode, + unsigned char *sources, unsigned char *healed_sinks) +{ + unsigned char *locked_on = NULL; + unsigned char *up_subvols = NULL; + unsigned char *output = NULL; + int ret = 0; + default_args_cbk_t *replies = NULL; + + EC_REPLIES_ALLOC(replies, ec->nodes); + locked_on = alloca0(ec->nodes); + output = alloca0(ec->nodes); + up_subvols = alloca0(ec->nodes); + ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes); + ret = cluster_inodelk(ec->xl_list, up_subvols, ec->nodes, replies, + locked_on, frame, ec->xl, ec->xl->name, inode, 0, 0); + { + if (ret <= ec->fragments) { + gf_msg_debug(ec->xl->name, 0, + "%s: Skipping heal " + "as only %d number of subvolumes could " + "be locked", + uuid_utoa(inode->gfid), ret); + ret = -ENOTCONN; + goto unlock; + } + ret = __ec_heal_metadata(frame, ec, inode, locked_on, sources, + healed_sinks); + } +unlock: + 
cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame, + ec->xl, ec->xl->name, inode, 0, 0); + cluster_replies_wipe(replies, ec->nodes); + return ret; +} + +/*entry heal*/ +int +__ec_heal_entry_prepare(call_frame_t *frame, ec_t *ec, inode_t *inode, + unsigned char *locked_on, uint64_t *versions, + uint64_t *dirty, unsigned char *sources, + unsigned char *healed_sinks) +{ + loc_t loc = {0}; + int source = 0; + int ret = 0; + default_args_cbk_t *replies = NULL; + unsigned char *output = NULL; + dict_t *xdata = NULL; + + EC_REPLIES_ALLOC(replies, ec->nodes); + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + xdata = dict_new(); + if (!xdata) { + ret = -ENOMEM; + goto out; + } + + if (dict_set_uint64(xdata, EC_XATTR_VERSION, 0) || + dict_set_uint64(xdata, EC_XATTR_DIRTY, 0)) { + ret = -ENOMEM; + goto out; + } + + output = alloca0(ec->nodes); + ret = cluster_lookup(ec->xl_list, locked_on, ec->nodes, replies, output, + frame, ec->xl, &loc, xdata); + if (ret <= ec->fragments) { + ret = -ENOTCONN; + goto out; + } + + source = ec_heal_entry_find_direction(ec, replies, versions, dirty, sources, + healed_sinks); + if (source < 0) { + ret = -EIO; + goto out; + } + ret = source; +out: + if (xdata) + dict_unref(xdata); + loc_wipe(&loc); + cluster_replies_wipe(replies, ec->nodes); + return ret; +} +int32_t +ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia, + call_frame_t *frame, xlator_t *this, unsigned char *on) +{ + dict_t *xattr = NULL; + int32_t ret = -1; + default_args_cbk_t *replies = NULL; + unsigned char *output = NULL; + uint64_t dirty[EC_VERSION_SIZE] = {1, 1}; + loc_t newloc = {0}; + + /*Symlinks don't have any data to be healed*/ + if (ia->ia_type == IA_IFLNK) + dirty[EC_DATA_TXN] = 0; + + newloc.inode = inode_ref(loc->inode); + gf_uuid_copy(newloc.gfid, ia->ia_gfid); + EC_REPLIES_ALLOC(replies, ec->nodes); + output = alloca0(ec->nodes); + xattr = dict_new(); + if (!xattr) { + ret = -ENOMEM; + goto out; + } + + 
ret = ec_dict_set_array(xattr, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE); + if (ret) + goto out; + + ret = cluster_xattrop(ec->xl_list, on, ec->nodes, replies, output, frame, + ec->xl, &newloc, GF_XATTROP_ADD_ARRAY64, xattr, NULL); + + if (ret < ec->fragments) { + ret = -ENOTCONN; + goto out; + } + +out: + if (xattr) + dict_unref(xattr); + cluster_replies_wipe(replies, ec->nodes); + loc_wipe(&newloc); + return ret; +} + +/*Name heal*/ +int +ec_delete_stale_name(dict_t *gfid_db, char *key, data_t *d, void *data) +{ + struct ec_name_data *name_data = data; + struct iatt *ia = NULL; + ec_t *ec = NULL; + loc_t loc = {0}; + unsigned char *same = data_to_bin(d); + default_args_cbk_t *replies = NULL; + unsigned char *output = NULL; + int ret = 0; + int estale_count = 0; + int i = 0; + call_frame_t *frame = name_data->frame; + uuid_t gfid; + + ec = name_data->frame->this->private; + EC_REPLIES_ALLOC(replies, ec->nodes); + if (EC_COUNT(same, ec->nodes) >= ec->fragments) { + ret = 0; + goto out; + } + + loc.parent = inode_ref(name_data->parent); + loc.inode = inode_new(name_data->parent->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + + gf_uuid_parse(key, gfid); + gf_uuid_copy(loc.pargfid, name_data->parent->gfid); + loc.name = name_data->name; + output = alloca0(ec->nodes); + ret = cluster_lookup(ec->xl_list, name_data->participants, ec->nodes, + replies, output, name_data->frame, ec->xl, &loc, NULL); + + for (i = 0; i < ec->nodes; i++) { + if (!replies[i].valid) + continue; + if (replies[i].op_ret == -1) { + if (replies[i].op_errno == ESTALE || replies[i].op_errno == ENOENT) + estale_count++; + else + name_data->participants[i] = 0; + } else if (gf_uuid_compare(gfid, replies[i].stat.ia_gfid)) { + estale_count++; + gf_msg_debug(ec->xl->name, 0, "%s/%s: different gfid as %s", + uuid_utoa(name_data->parent->gfid), name_data->name, + key); + } + } + + if (estale_count <= ec->redundancy) { + /* We have at least ec->fragments number of fragments, so the + * file is 
recoverable, so don't delete it*/ + + /* Please note that the lookup call above could fail with + * ENOTCONN on all subvoumes and still this branch will be + * true, but in those cases conservatively we decide to not + * delete the file until we are sure*/ + ret = 0; + goto out; + } + + /*Noway to recover, delete the name*/ + loc_wipe(&loc); + loc.parent = inode_ref(name_data->parent); + gf_uuid_copy(loc.pargfid, loc.parent->gfid); + loc.name = name_data->name; + for (i = 0; i < ec->nodes; i++) { + if (same[i] && replies[i].valid && (replies[i].op_ret == 0)) { + ia = &replies[i].stat; + break; + } + } + + if (!ia) { + ret = -ENOTCONN; + goto out; + } + + if (IA_ISDIR(ia->ia_type)) { + ret = cluster_rmdir(ec->xl_list, same, ec->nodes, replies, output, + frame, ec->xl, &loc, 1, NULL); + gf_msg_debug(ec->xl->name, 0, + "cluster rmdir succeeded on %d " + "nodes", + ret); + } else { + ret = cluster_unlink(ec->xl_list, same, ec->nodes, replies, output, + frame, ec->xl, &loc, 0, NULL); + gf_msg_debug(ec->xl->name, 0, + "cluster unlink succeeded on %d " + "nodes", + ret); + } + + for (i = 0; i < ec->nodes; i++) { + if (output[i]) { + same[i] = 0; + name_data->enoent[i] = 1; + } else { + /*op failed*/ + if (same[i]) + name_data->participants[i] = 0; + } + } + ret = 0; + /*This will help in making decisions about creating names*/ + dict_del(gfid_db, key); +out: + if (ret < 0) { + gf_msg_debug(ec->xl->name, 0, "%s/%s: heal failed %s", + uuid_utoa(name_data->parent->gfid), name_data->name, + strerror(-ret)); + } + cluster_replies_wipe(replies, ec->nodes); + loc_wipe(&loc); + return ret; +} + +int +ec_delete_stale_names(call_frame_t *frame, ec_t *ec, inode_t *parent, + char *name, default_args_cbk_t *replies, dict_t *gfid_db, + unsigned char *enoent, unsigned char *gfidless, + unsigned char *participants) +{ + struct ec_name_data name_data = {0}; + + name_data.enoent = enoent; + name_data.gfidless = gfidless; + name_data.participants = participants; + name_data.name = name; + 
name_data.parent = parent; + name_data.frame = frame; + name_data.replies = replies; + return dict_foreach(gfid_db, ec_delete_stale_name, &name_data); +} + +int +_assign_same(dict_t *dict, char *key, data_t *value, void *data) +{ + struct ec_name_data *name_data = data; + + name_data->same = data_to_bin(value); + return 0; +} + +int +ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name, + default_args_cbk_t *lookup_replies, dict_t *gfid_db, + unsigned char *enoent, unsigned char *participants) +{ + int ret = 0; + int i = 0; + struct ec_name_data name_data = {0}; + struct iatt *ia = NULL; + unsigned char *output = 0; + unsigned char *output1 = 0; + unsigned char *on = NULL; + default_args_cbk_t *replies = NULL; + loc_t loc = {0}; + loc_t srcloc = {0}; + unsigned char *link = NULL; + unsigned char *create = NULL; + dict_t *xdata = NULL; + char *linkname = NULL; + ec_config_t config; + + /* There should be just one gfid key */ + EC_REPLIES_ALLOC(replies, ec->nodes); + if (gfid_db->count != 1) { + ret = -EINVAL; + goto out; + } + + ret = dict_foreach(gfid_db, _assign_same, &name_data); + if (ret < 0) + goto out; + /*There should at least be one valid success reply with gfid*/ + for (i = 0; i < ec->nodes; i++) + if (name_data.same[i]) + break; + + if (i == ec->nodes) { + ret = -EINVAL; + goto out; + } + + ia = &lookup_replies[i].stat; + xdata = dict_new(); + loc.parent = inode_ref(parent); + gf_uuid_copy(loc.pargfid, parent->gfid); + loc.inode = inode_new(parent->table); + if (loc.inode) + srcloc.inode = inode_ref(loc.inode); + gf_uuid_copy(srcloc.gfid, ia->ia_gfid); + if (!loc.inode || !xdata || + dict_set_static_bin(xdata, "gfid-req", ia->ia_gfid, + sizeof(ia->ia_gfid))) { + ret = -ENOMEM; + goto out; + } + loc.name = name; + link = alloca0(ec->nodes); + create = alloca0(ec->nodes); + on = alloca0(ec->nodes); + output = alloca0(ec->nodes); + output1 = alloca0(ec->nodes); + + for (i = 0; i < ec->nodes; i++) { + if (!lookup_replies[i].valid) + 
continue; + if (lookup_replies[i].op_ret) + continue; + on[i] = 1; + } + switch (ia->ia_type) { + case IA_IFDIR: + ec_set_new_entry_dirty(ec, &loc, ia, frame, ec->xl, on); + (void)cluster_mkdir( + ec->xl_list, enoent, ec->nodes, replies, output, frame, ec->xl, + &loc, st_mode_from_ia(ia->ia_prot, ia->ia_type), 0, xdata); + break; + + case IA_IFLNK: + /*Check for hard links and create/link*/ + ret = cluster_lookup(ec->xl_list, enoent, ec->nodes, replies, + output, frame, ec->xl, &srcloc, NULL); + for (i = 0; i < ec->nodes; i++) { + if (output[i]) { + link[i] = 1; + } else { + if (replies[i].op_errno == ENOENT || + replies[i].op_errno == ESTALE) { + create[i] = 1; + } + } + } + + if (EC_COUNT(link, ec->nodes)) { + cluster_link(ec->xl_list, link, ec->nodes, replies, output1, + frame, ec->xl, &srcloc, &loc, NULL); + } + + if (EC_COUNT(create, ec->nodes)) { + cluster_readlink(ec->xl_list, name_data.same, ec->nodes, + replies, output, frame, ec->xl, &srcloc, 4096, + NULL); + if (EC_COUNT(output, ec->nodes) == 0) { + ret = -ENOTCONN; + goto out; + } + + for (i = 0; i < ec->nodes; i++) { + if (output[i]) + break; + } + linkname = alloca0(strlen(replies[i].buf) + 1); + strcpy(linkname, replies[i].buf); + ec_set_new_entry_dirty(ec, &loc, ia, frame, ec->xl, on); + cluster_symlink(ec->xl_list, create, ec->nodes, replies, output, + frame, ec->xl, linkname, &loc, 0, xdata); + } + for (i = 0; i < ec->nodes; i++) + if (output1[i]) + output[i] = 1; + break; + case IA_IFREG: + ec_set_new_entry_dirty(ec, &loc, ia, frame, ec->xl, on); + config.version = EC_CONFIG_VERSION; + config.algorithm = EC_CONFIG_ALGORITHM; + config.gf_word_size = EC_GF_BITS; + config.bricks = ec->nodes; + config.redundancy = ec->redundancy; + config.chunk_size = EC_METHOD_CHUNK_SIZE; + + ret = ec_dict_set_config(xdata, EC_XATTR_CONFIG, &config); + if (ret != 0) { + goto out; + } + + /* Fall through */ + + default: + ret = dict_set_int32(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1); + if (ret) + goto out; + ret = 
cluster_mknod( + ec->xl_list, enoent, ec->nodes, replies, output, frame, ec->xl, + &loc, st_mode_from_ia(ia->ia_prot, ia->ia_type), + makedev(ia_major(ia->ia_rdev), ia_minor(ia->ia_rdev)), 0, + xdata); + break; + } + + for (i = 0; i < ec->nodes; i++) { + if (enoent[i] && !output[i]) + participants[i] = 0; + } + + ret = 0; +out: + if (ret < 0) + gf_msg_debug(ec->xl->name, 0, "%s/%s: heal failed %s", + uuid_utoa(parent->gfid), name, strerror(-ret)); + cluster_replies_wipe(replies, ec->nodes); + loc_wipe(&loc); + loc_wipe(&srcloc); + if (xdata) + dict_unref(xdata); + return ret; +} + +int +__ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name, + unsigned char *participants) +{ + unsigned char *output = NULL; + unsigned char *enoent = NULL; + default_args_cbk_t *replies = NULL; + dict_t *xdata = NULL; + dict_t *gfid_db = NULL; + int ret = 0; + loc_t loc = {0}; + int i = 0; + struct iatt *ia = NULL; + char gfid[64] = {0}; + unsigned char *same = NULL; + unsigned char *gfidless = NULL; + + EC_REPLIES_ALLOC(replies, ec->nodes); + loc.parent = inode_ref(parent); + loc.inode = inode_new(parent->table); + gf_uuid_copy(loc.pargfid, parent->gfid); + loc.name = name; + xdata = dict_new(); + gfid_db = dict_new(); + if (!xdata || !gfid_db || !loc.inode) { + ret = -ENOMEM; + goto out; + } + + ret = dict_set_int32(xdata, GF_GFIDLESS_LOOKUP, 1); + if (ret) { + ret = -ENOMEM; + goto out; + } + + output = alloca0(ec->nodes); + gfidless = alloca0(ec->nodes); + enoent = alloca0(ec->nodes); + ret = cluster_lookup(ec->xl_list, participants, ec->nodes, replies, output, + frame, ec->xl, &loc, NULL); + for (i = 0; i < ec->nodes; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret == -1) { + /*If ESTALE comes here, that means parent dir is not + * present, nothing to do there, so reset participants + * for that brick*/ + if (replies[i].op_errno == ENOENT) + enoent[i] = 1; + else + participants[i] = 0; + continue; + } + ia = &replies[i].stat; + if 
(gf_uuid_is_null(ia->ia_gfid)) { + if (IA_ISDIR(ia->ia_type) || ia->ia_size == 0) + gfidless[i] = 1; + else + participants[i] = 0; + } else { + uuid_utoa_r(ia->ia_gfid, gfid); + ret = dict_get_bin(gfid_db, gfid, (void **)&same); + if (ret < 0) { + same = alloca0(ec->nodes); + } + same[i] = 1; + if (ret < 0) { + ret = dict_set_static_bin(gfid_db, gfid, same, ec->nodes); + } + if (ret < 0) + goto out; + } + } + + ret = ec_delete_stale_names(frame, ec, parent, name, replies, gfid_db, + enoent, gfidless, participants); + + if (gfid_db->count == 0) { + /* All entries seem to be stale entries and deleted, + * nothing more to do.*/ + goto out; + } + + if (gfid_db->count > 1) { + gf_msg(ec->xl->name, GF_LOG_INFO, 0, EC_MSG_HEAL_FAIL, + "%s/%s: Not able to heal", uuid_utoa(parent->gfid), name); + memset(participants, 0, ec->nodes); + goto out; + } + + EC_INTERSECT(enoent, enoent, participants, ec->nodes); + if (EC_COUNT(enoent, ec->nodes) == 0) { + ret = 0; + goto out; + } + + ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent, + participants); + if (ret >= 0) { + /* If ec_create_name() succeeded we return 1 to indicate that a new + * file has been created and it will need to be healed. 
*/ + ret = 1; + } +out: + cluster_replies_wipe(replies, ec->nodes); + loc_wipe(&loc); + if (xdata) + dict_unref(xdata); + if (gfid_db) + dict_unref(gfid_db); + return ret; +} + +int +ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name, + unsigned char *participants) +{ + int ret = 0; + default_args_cbk_t *replies = NULL; + unsigned char *output = NULL; + unsigned char *locked_on = NULL; + loc_t loc = {0}; + + loc.parent = inode_ref(parent); + loc.name = name; + loc.inode = inode_new(parent->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + + EC_REPLIES_ALLOC(replies, ec->nodes); + output = alloca0(ec->nodes); + locked_on = alloca0(ec->nodes); + ret = cluster_inodelk(ec->xl_list, participants, ec->nodes, replies, + locked_on, frame, ec->xl, ec->xl->name, parent, 0, 0); + { + if (ret <= ec->fragments) { + gf_msg_debug(ec->xl->name, 0, + "%s/%s: Skipping " + "heal as only %d number of subvolumes could " + "be locked", + uuid_utoa(parent->gfid), name, ret); + ret = -ENOTCONN; + goto unlock; + } + EC_INTERSECT(participants, participants, locked_on, ec->nodes); + ret = __ec_heal_name(frame, ec, parent, name, participants); + } +unlock: + cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame, + ec->xl, ec->xl->name, parent, 0, 0); +out: + cluster_replies_wipe(replies, ec->nodes); + loc_wipe(&loc); + return ret; +} + +int +ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + void *data) +{ + struct ec_name_data *name_data = data; + xlator_t *this = THIS; + ec_t *ec = this->private; + unsigned char *name_on = alloca0(ec->nodes); + int i = 0; + int ret = 0; + + if (ec->shutdown) { + gf_msg_debug(this->name, 0, + "Cancelling directory heal " + "because EC is stopping."); + return -ENOTCONN; + } + + memcpy(name_on, name_data->participants, ec->nodes); + ret = ec_heal_name(name_data->frame, ec, parent->inode, entry->d_name, + name_on); + + if (ret < 0) { + memset(name_on, 0, ec->nodes); + } else { + 
name_data->heal_pending += ret; + } + + for (i = 0; i < ec->nodes; i++) + if (name_data->participants[i] && !name_on[i]) + name_data->failed_on[i] = 1; + + return 0; +} + +int +ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode, + unsigned char *participants, uint32_t *pending) +{ + int i = 0; + int j = 0; + loc_t loc = {0}; + struct ec_name_data name_data = {0}; + int ret = 0; + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + name_data.frame = frame; + name_data.participants = participants; + name_data.failed_on = alloca0(ec->nodes); + name_data.heal_pending = 0; + + for (i = 0; i < ec->nodes; i++) { + if (!participants[i]) + continue; + ret = syncop_dir_scan(ec->xl_list[i], &loc, GF_CLIENT_PID_SELF_HEALD, + &name_data, ec_name_heal_handler); + if (ret < 0) { + break; + } + for (j = 0; j < ec->nodes; j++) + if (name_data.failed_on[j]) + participants[j] = 0; + + if (EC_COUNT(participants, ec->nodes) <= ec->fragments) { + ret = -ENOTCONN; + break; + } + } + *pending += name_data.heal_pending; + + loc_wipe(&loc); + return ret; +} + +int +__ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode, + unsigned char *heal_on, unsigned char *sources, + unsigned char *healed_sinks, uint32_t *pending) +{ + unsigned char *locked_on = NULL; + unsigned char *output = NULL; + uint64_t *versions = NULL; + uint64_t *dirty = NULL; + unsigned char *participants = NULL; + default_args_cbk_t *replies = NULL; + int ret = 0; + int source = 0; + int i = 0; + + locked_on = alloca0(ec->nodes); + output = alloca0(ec->nodes); + versions = alloca0(ec->nodes * sizeof(*versions)); + dirty = alloca0(ec->nodes * sizeof(*dirty)); + + EC_REPLIES_ALLOC(replies, ec->nodes); + ret = cluster_inodelk(ec->xl_list, heal_on, ec->nodes, replies, locked_on, + frame, ec->xl, ec->xl->name, inode, 0, 0); + { + if (ret <= ec->fragments) { + gf_msg_debug(ec->xl->name, 0, + "%s: Skipping heal " + "as only %d number of subvolumes could " + "be locked", + uuid_utoa(inode->gfid), 
ret); + ret = -ENOTCONN; + goto unlock; + } + ret = __ec_heal_entry_prepare(frame, ec, inode, locked_on, versions, + dirty, sources, healed_sinks); + source = ret; + } +unlock: + cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame, + ec->xl, ec->xl->name, inode, 0, 0); + if (ret < 0) + goto out; + + participants = alloca0(ec->nodes); + for (i = 0; i < ec->nodes; i++) { + if (sources[i] || healed_sinks[i]) + participants[i] = 1; + } + ret = ec_heal_names(frame, ec, inode, participants, pending); + + if (EC_COUNT(participants, ec->nodes) <= ec->fragments) + goto out; + + for (i = 0; i < ec->nodes; i++) { + if (!participants[i]) { + sources[i] = 0; + healed_sinks[i] = 0; + } + } + + ec_adjust_versions(frame, ec, EC_DATA_TXN, inode, source, sources, + healed_sinks, versions, dirty); +out: + cluster_replies_wipe(replies, ec->nodes); + return ret; +} + +int +ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode, + unsigned char *sources, unsigned char *healed_sinks, + uint32_t *pending) +{ + unsigned char *locked_on = NULL; + unsigned char *up_subvols = NULL; + unsigned char *output = NULL; + char selfheal_domain[1024] = {0}; + int ret = 0; + default_args_cbk_t *replies = NULL; + + EC_REPLIES_ALLOC(replies, ec->nodes); + locked_on = alloca0(ec->nodes); + output = alloca0(ec->nodes); + up_subvols = alloca0(ec->nodes); + + sprintf(selfheal_domain, "%s:self-heal", ec->xl->name); + ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes); + /*If other processes are already doing the heal, don't block*/ + ret = cluster_tiebreaker_inodelk(ec->xl_list, up_subvols, ec->nodes, + replies, locked_on, frame, ec->xl, + selfheal_domain, inode, 0, 0); + { + if (ret <= ec->fragments) { + gf_msg_debug(ec->xl->name, 0, + "%s: Skipping heal " + "as only %d number of subvolumes could " + "be locked", + uuid_utoa(inode->gfid), ret); + ret = -ENOTCONN; + goto unlock; + } + ret = __ec_heal_entry(frame, ec, inode, locked_on, sources, + healed_sinks, pending); + } 
+unlock: + cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame, + ec->xl, selfheal_domain, inode, 0, 0); + cluster_replies_wipe(replies, ec->nodes); + return ret; +} + +/*Find direction for data heal and heal info*/ +int +ec_heal_data_find_direction(ec_t *ec, default_args_cbk_t *replies, + uint64_t *data_versions, uint64_t *dirty, + uint64_t *size, unsigned char *sources, + unsigned char *healed_sinks, + gf_boolean_t check_ondisksize, int which) +{ + uint64_t xattr[EC_VERSION_SIZE] = {0}; + char version_size[128] = {0}; + dict_t *version_size_db = NULL; + unsigned char *same = NULL; + int max_same_count = 0; + int source = 0; + int i = 0; + int ret = 0; + dict_t *dict = NULL; + uint64_t source_size = 0; + + version_size_db = dict_new(); + if (!version_size_db) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < ec->nodes; i++) { + if (!replies[i].valid) + continue; + if (replies[i].op_ret < 0) + continue; + dict = (which == EC_COMBINE_XDATA) ? replies[i].xdata + : replies[i].xattr; + + ret = ec_dict_get_array(dict, EC_XATTR_VERSION, xattr, EC_VERSION_SIZE); + if (ret == 0) { + data_versions[i] = xattr[EC_DATA_TXN]; + } + + memset(xattr, 0, sizeof(xattr)); + ret = ec_dict_get_array(dict, EC_XATTR_DIRTY, xattr, EC_VERSION_SIZE); + if (ret == 0) { + dirty[i] = xattr[EC_DATA_TXN]; + } + ret = ec_dict_del_number(dict, EC_XATTR_SIZE, &size[i]); + /*Build a db of same metadata and data version and size*/ + snprintf(version_size, sizeof(version_size), "%" PRIu64 "-%" PRIu64, + data_versions[i], size[i]); + + ret = dict_get_bin(version_size_db, version_size, (void **)&same); + if (ret < 0) { + same = alloca0(ec->nodes); + } + + same[i] = 1; + if (max_same_count < EC_COUNT(same, ec->nodes)) { + max_same_count = EC_COUNT(same, ec->nodes); + source = i; + } + + if (ret < 0) { + ret = dict_set_static_bin(version_size_db, version_size, same, + ec->nodes); + } + + if (ret < 0) { + ret = -ENOMEM; + goto out; + } + } + /* If we don't have ec->fragments 
number of same version,size it is not + * recoverable*/ + if (max_same_count < ec->fragments) { + ret = -EIO; + goto out; + } else { + snprintf(version_size, sizeof(version_size), "%" PRIu64 "-%" PRIu64, + data_versions[source], size[source]); + + ret = dict_get_bin(version_size_db, version_size, (void **)&same); + if (ret < 0) + goto out; + memcpy(sources, same, ec->nodes); + for (i = 0; i < ec->nodes; i++) { + if (replies[i].valid && (replies[i].op_ret == 0) && !sources[i]) + healed_sinks[i] = 1; + } + } + + /* There could be files with versions, size same but on disk ia_size + * could be different because of disk crashes, mark them as sinks as + * well*/ + + if (check_ondisksize) { + source_size = size[source]; + ec_adjust_size_up(ec, &source_size, _gf_true); + + for (i = 0; i < ec->nodes; i++) { + if (sources[i]) { + if (replies[i].stat.ia_size != source_size) { + sources[i] = 0; + healed_sinks[i] = 1; + max_same_count--; + } else { + source = i; + } + } + } + if (max_same_count < ec->fragments) { + ret = -EIO; + goto out; + } + } + + ret = source; +out: + if (version_size_db) + dict_unref(version_size_db); + return ret; +} + +int +__ec_heal_data_prepare(call_frame_t *frame, ec_t *ec, fd_t *fd, + unsigned char *locked_on, uint64_t *versions, + uint64_t *dirty, uint64_t *size, unsigned char *sources, + unsigned char *healed_sinks, unsigned char *trim, + struct iatt *stbuf) +{ + default_args_cbk_t *replies = NULL; + default_args_cbk_t *fstat_replies = NULL; + unsigned char *output = NULL; + unsigned char *fstat_output = NULL; + dict_t *xattrs = NULL; + uint64_t zero_array[2] = {0}; + int source = 0; + int ret = 0; + uint64_t zero_value = 0; + int i = 0; + + EC_REPLIES_ALLOC(replies, ec->nodes); + EC_REPLIES_ALLOC(fstat_replies, ec->nodes); + output = alloca0(ec->nodes); + fstat_output = alloca0(ec->nodes); + xattrs = dict_new(); + if (!xattrs || + dict_set_static_bin(xattrs, EC_XATTR_VERSION, zero_array, + sizeof(zero_array)) || + dict_set_static_bin(xattrs, 
EC_XATTR_DIRTY, zero_array, + sizeof(zero_array)) || + dict_set_static_bin(xattrs, EC_XATTR_SIZE, &zero_value, + sizeof(zero_value))) { + ret = -ENOMEM; + goto out; + } + + ret = cluster_fxattrop(ec->xl_list, locked_on, ec->nodes, replies, output, + frame, ec->xl, fd, GF_XATTROP_ADD_ARRAY64, xattrs, + NULL); + + ret = cluster_fstat(ec->xl_list, locked_on, ec->nodes, fstat_replies, + fstat_output, frame, ec->xl, fd, NULL); + + for (i = 0; i < ec->nodes; i++) { + output[i] = output[i] && fstat_output[i]; + replies[i].valid = output[i]; + if (output[i]) + replies[i].stat = fstat_replies[i].stat; + } + + if (EC_COUNT(output, ec->nodes) <= ec->fragments) { + ret = -ENOTCONN; + goto out; + } + + source = ec_heal_data_find_direction(ec, replies, versions, dirty, size, + sources, healed_sinks, _gf_true, + EC_COMBINE_DICT); + ret = source; + if (ret < 0) + goto out; + + if (stbuf) + *stbuf = replies[source].stat; + + for (i = 0; i < ec->nodes; i++) { + if (healed_sinks[i]) { + if (replies[i].stat.ia_size) + trim[i] = 1; + } + } + + if (EC_COUNT(sources, ec->nodes) < ec->fragments) { + ret = -ENOTCONN; + goto out; + } + + ret = source; +out: + if (xattrs) + dict_unref(xattrs); + cluster_replies_wipe(replies, ec->nodes); + cluster_replies_wipe(fstat_replies, ec->nodes); + if (ret < 0) { + gf_msg_debug(ec->xl->name, 0, "%s: heal failed %s", + uuid_utoa(fd->inode->gfid), strerror(-ret)); + } else { + gf_msg_debug(ec->xl->name, 0, + "%s: sources: %d, sinks: " + "%d", + uuid_utoa(fd->inode->gfid), EC_COUNT(sources, ec->nodes), + EC_COUNT(healed_sinks, ec->nodes)); + } + return ret; +} + +int +__ec_heal_mark_sinks(call_frame_t *frame, ec_t *ec, fd_t *fd, + uint64_t *versions, unsigned char *healed_sinks) +{ + int i = 0; + int ret = 0; + unsigned char *mark = NULL; + dict_t *xattrs = NULL; + default_args_cbk_t *replies = NULL; + unsigned char *output = NULL; + uint64_t versions_xattr[2] = {0}; + + EC_REPLIES_ALLOC(replies, ec->nodes); + xattrs = dict_new(); + if (!xattrs) { + ret = 
-ENOMEM; + goto out; + } + + mark = alloca0(ec->nodes); + for (i = 0; i < ec->nodes; i++) { + if (!healed_sinks[i]) + continue; + if ((versions[i] >> EC_SELFHEAL_BIT) & 1) + continue; + mark[i] = 1; + } + + if (EC_COUNT(mark, ec->nodes) == 0) + return 0; + + versions_xattr[EC_DATA_TXN] = hton64(1ULL << EC_SELFHEAL_BIT); + if (dict_set_static_bin(xattrs, EC_XATTR_VERSION, versions_xattr, + sizeof(versions_xattr))) { + ret = -ENOMEM; + goto out; + } + + output = alloca0(ec->nodes); + ret = cluster_fxattrop(ec->xl_list, mark, ec->nodes, replies, output, frame, + ec->xl, fd, GF_XATTROP_ADD_ARRAY64, xattrs, NULL); + for (i = 0; i < ec->nodes; i++) { + if (!output[i]) { + if (mark[i]) + healed_sinks[i] = 0; + continue; + } + versions[i] |= (1ULL << EC_SELFHEAL_BIT); + } + + if (EC_COUNT(healed_sinks, ec->nodes) == 0) { + ret = -ENOTCONN; + goto out; + } + ret = 0; + +out: + cluster_replies_wipe(replies, ec->nodes); + if (xattrs) + dict_unref(xattrs); + if (ret < 0) + gf_msg_debug(ec->xl->name, 0, "%s: heal failed %s", + uuid_utoa(fd->inode->gfid), strerror(-ret)); + return ret; +} + +int32_t +ec_manager_heal_block(ec_fop_data_t *fop, int32_t state) +{ + ec_heal_t *heal = fop->data; + heal->fop = fop; + + switch (state) { + case EC_STATE_INIT: + ec_owner_set(fop->frame, fop->frame->root); + + ec_heal_inodelk(heal, F_WRLCK, 1, 0, 0); + + return EC_STATE_HEAL_DATA_COPY; + + case EC_STATE_HEAL_DATA_COPY: + gf_msg_debug(fop->xl->name, 0, "%s: read/write starting", + uuid_utoa(heal->fd->inode->gfid)); + ec_heal_data_block(heal); + + return EC_STATE_HEAL_DATA_UNLOCK; + + case -EC_STATE_HEAL_DATA_COPY: + case -EC_STATE_HEAL_DATA_UNLOCK: + case EC_STATE_HEAL_DATA_UNLOCK: + ec_heal_inodelk(heal, F_UNLCK, 1, 0, 0); + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + if (fop->cbks.heal) { + fop->cbks.heal(fop->req_frame, fop->data, fop->xl, 0, 0, + (heal->good | heal->bad), heal->good, heal->bad, + 0, NULL); + } + + return EC_STATE_END; + case -EC_STATE_REPORT: + if 
(fop->cbks.heal) { + fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1, + fop->error, 0, 0, 0, 0, NULL); + } + + return EC_STATE_END; + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, 0, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +/*Takes lock */ +void +ec_heal_block(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_heal_cbk_t func, ec_heal_t *heal) +{ + ec_cbk_t callback = {.heal = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(HEAL) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, 0, target, fop_flags, + NULL, ec_manager_heal_block, callback, heal); + if (fop == NULL) + goto out; + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL); + } +} + +int32_t +ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, uintptr_t mask, + uintptr_t good, uintptr_t bad, uint32_t pending, + dict_t *xdata) +{ + ec_heal_t *heal = cookie; + + if (heal->fop) { + heal->fop->heal = NULL; + } + heal->fop = NULL; + heal->error = op_ret < 0 ? 
op_errno : 0; + syncbarrier_wake(heal->data); + return 0; +} + +int +ec_sync_heal_block(call_frame_t *frame, xlator_t *this, ec_heal_t *heal) +{ + ec_heal_block(frame, this, heal->bad | heal->good, EC_MINIMUM_ONE, + ec_heal_block_done, heal); + syncbarrier_wait(heal->data, 1); + if (heal->error != 0) { + return -heal->error; + } + if (heal->bad == 0) + return -ENOTCONN; + return 0; +} + +int +ec_rebuild_data(call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size, + unsigned char *sources, unsigned char *healed_sinks) +{ + ec_heal_t *heal = NULL; + int ret = 0; + syncbarrier_t barrier; + + if (syncbarrier_init(&barrier)) + return -ENOMEM; + + heal = alloca0(sizeof(*heal)); + heal->fd = fd_ref(fd); + heal->xl = ec->xl; + heal->data = &barrier; + ec_adjust_size_up(ec, &size, _gf_false); + heal->total_size = size; + heal->size = (128 * GF_UNIT_KB * (ec->self_heal_window_size)); + /* We need to adjust the size to a multiple of the stripe size of the + * volume. Otherwise writes would need to fill gaps (head and/or tail) + * with existent data from the bad bricks. This could be garbage on a + * damaged file or it could fail if there aren't enough bricks. */ + heal->size -= heal->size % ec->stripe_size; + heal->bad = ec_char_array_to_mask(healed_sinks, ec->nodes); + heal->good = ec_char_array_to_mask(sources, ec->nodes); + heal->iatt.ia_type = IA_IFREG; + LOCK_INIT(&heal->lock); + + for (heal->offset = 0; (heal->offset < size) && !heal->done; + heal->offset += heal->size) { + /* We immediately abort any heal if a shutdown request has been + * received to avoid delays. The healing of this file will be + * restarted by another SHD or other client that accesses the + * file. 
*/ + if (ec->shutdown) { + gf_msg_debug(ec->xl->name, 0, + "Cancelling heal because " + "EC is stopping."); + ret = -ENOTCONN; + break; + } + + gf_msg_debug(ec->xl->name, 0, + "%s: sources: %d, sinks: " + "%d, offset: %" PRIu64 " bsize: %" PRIu64, + uuid_utoa(fd->inode->gfid), EC_COUNT(sources, ec->nodes), + EC_COUNT(healed_sinks, ec->nodes), heal->offset, + heal->size); + ret = ec_sync_heal_block(frame, ec->xl, heal); + if (ret < 0) + break; + } + memset(healed_sinks, 0, ec->nodes); + ec_mask_to_char_array(heal->bad, healed_sinks, ec->nodes); + fd_unref(heal->fd); + LOCK_DESTROY(&heal->lock); + syncbarrier_destroy(heal->data); + if (ret < 0) + gf_msg_debug(ec->xl->name, 0, "%s: heal failed %s", + uuid_utoa(fd->inode->gfid), strerror(-ret)); + return ret; +} + +int +__ec_heal_trim_sinks(call_frame_t *frame, ec_t *ec, fd_t *fd, + unsigned char *healed_sinks, unsigned char *trim, + uint64_t size) +{ + default_args_cbk_t *replies = NULL; + unsigned char *output = NULL; + int ret = 0; + int i = 0; + off_t trim_offset = 0; + + EC_REPLIES_ALLOC(replies, ec->nodes); + output = alloca0(ec->nodes); + + if (EC_COUNT(trim, ec->nodes) == 0) { + ret = 0; + goto out; + } + trim_offset = size; + ec_adjust_offset_up(ec, &trim_offset, _gf_true); + ret = cluster_ftruncate(ec->xl_list, trim, ec->nodes, replies, output, + frame, ec->xl, fd, trim_offset, NULL); + for (i = 0; i < ec->nodes; i++) { + if (!output[i] && trim[i]) + healed_sinks[i] = 0; + } + + if (EC_COUNT(healed_sinks, ec->nodes) == 0) { + ret = -ENOTCONN; + goto out; + } + +out: + cluster_replies_wipe(replies, ec->nodes); + if (ret < 0) + gf_msg_debug(ec->xl->name, 0, "%s: heal failed %s", + uuid_utoa(fd->inode->gfid), strerror(-ret)); + return ret; +} + +int +ec_data_undo_pending(call_frame_t *frame, ec_t *ec, fd_t *fd, dict_t *xattr, + uint64_t *versions, uint64_t *dirty, uint64_t *size, + int source, gf_boolean_t erase_dirty, int idx) +{ + uint64_t versions_xattr[2] = {0}; + uint64_t dirty_xattr[2] = {0}; + uint64_t 
allzero[2] = {0}; + uint64_t size_xattr = 0; + int ret = 0; + + versions_xattr[EC_DATA_TXN] = hton64(versions[source] - versions[idx]); + ret = dict_set_static_bin(xattr, EC_XATTR_VERSION, versions_xattr, + sizeof(versions_xattr)); + if (ret < 0) + goto out; + + size_xattr = hton64(size[source] - size[idx]); + ret = dict_set_static_bin(xattr, EC_XATTR_SIZE, &size_xattr, + sizeof(size_xattr)); + if (ret < 0) + goto out; + + if (erase_dirty) { + dirty_xattr[EC_DATA_TXN] = hton64(-dirty[idx]); + ret = dict_set_static_bin(xattr, EC_XATTR_DIRTY, dirty_xattr, + sizeof(dirty_xattr)); + if (ret < 0) + goto out; + } + + if ((memcmp(versions_xattr, allzero, sizeof(allzero)) == 0) && + (memcmp(dirty_xattr, allzero, sizeof(allzero)) == 0) && + (size_xattr == 0)) { + ret = 0; + goto out; + } + + ret = syncop_fxattrop(ec->xl_list[idx], fd, GF_XATTROP_ADD_ARRAY64, xattr, + NULL, NULL, NULL); +out: + return ret; +} + +int +__ec_fd_data_adjust_versions(call_frame_t *frame, ec_t *ec, fd_t *fd, + unsigned char *sources, + unsigned char *healed_sinks, uint64_t *versions, + uint64_t *dirty, uint64_t *size) +{ + dict_t *xattr = NULL; + int i = 0; + int ret = 0; + int op_ret = 0; + int source = -1; + gf_boolean_t erase_dirty = _gf_false; + + xattr = dict_new(); + if (!xattr) { + op_ret = -ENOMEM; + goto out; + } + + /* dirty xattr represents if the file needs heal. 
Unless all the + * copies are healed, don't erase it */ + if (EC_COUNT(sources, ec->nodes) + EC_COUNT(healed_sinks, ec->nodes) == + ec->nodes) + erase_dirty = _gf_true; + + for (i = 0; i < ec->nodes; i++) { + if (sources[i]) { + source = i; + break; + } + } + + if (source == -1) { + op_ret = -ENOTCONN; + goto out; + } + + for (i = 0; i < ec->nodes; i++) { + if (healed_sinks[i]) { + ret = ec_data_undo_pending(frame, ec, fd, xattr, versions, dirty, + size, source, erase_dirty, i); + if (ret < 0) + goto out; + } + } + + if (!erase_dirty) + goto out; + + for (i = 0; i < ec->nodes; i++) { + if (sources[i]) { + ret = ec_data_undo_pending(frame, ec, fd, xattr, versions, dirty, + size, source, erase_dirty, i); + if (ret < 0) + continue; + } + } +out: + if (xattr) + dict_unref(xattr); + return op_ret; +} + +int +ec_restore_time_and_adjust_versions(call_frame_t *frame, ec_t *ec, fd_t *fd, + unsigned char *sources, + unsigned char *healed_sinks, + uint64_t *versions, uint64_t *dirty, + uint64_t *size) +{ + unsigned char *locked_on = NULL; + unsigned char *participants = NULL; + unsigned char *output = NULL; + default_args_cbk_t *replies = NULL; + unsigned char *postsh_sources = NULL; + unsigned char *postsh_healed_sinks = NULL; + unsigned char *postsh_trim = NULL; + uint64_t *postsh_versions = NULL; + uint64_t *postsh_dirty = NULL; + uint64_t *postsh_size = NULL; + int ret = 0; + int i = 0; + struct iatt source_buf = {0}; + loc_t loc = {0}; + + locked_on = alloca0(ec->nodes); + output = alloca0(ec->nodes); + participants = alloca0(ec->nodes); + postsh_sources = alloca0(ec->nodes); + postsh_healed_sinks = alloca0(ec->nodes); + postsh_trim = alloca0(ec->nodes); + postsh_versions = alloca0(ec->nodes * sizeof(*postsh_versions)); + postsh_dirty = alloca0(ec->nodes * sizeof(*postsh_dirty)); + postsh_size = alloca0(ec->nodes * sizeof(*postsh_size)); + + for (i = 0; i < ec->nodes; i++) { + if (healed_sinks[i] || sources[i]) + participants[i] = 1; + } + + EC_REPLIES_ALLOC(replies, 
ec->nodes); + ret = cluster_inodelk(ec->xl_list, participants, ec->nodes, replies, + locked_on, frame, ec->xl, ec->xl->name, fd->inode, 0, + 0); + { + if (ret <= ec->fragments) { + gf_msg_debug(ec->xl->name, 0, + "%s: Skipping heal " + "as only %d number of subvolumes could " + "be locked", + uuid_utoa(fd->inode->gfid), ret); + ret = -ENOTCONN; + goto unlock; + } + + ret = __ec_heal_data_prepare(frame, ec, fd, locked_on, postsh_versions, + postsh_dirty, postsh_size, postsh_sources, + postsh_healed_sinks, postsh_trim, + &source_buf); + if (ret < 0) + goto unlock; + + loc.inode = inode_ref(fd->inode); + gf_uuid_copy(loc.gfid, fd->inode->gfid); + ret = cluster_setattr( + ec->xl_list, healed_sinks, ec->nodes, replies, output, frame, + ec->xl, &loc, &source_buf, + GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME, NULL); + EC_INTERSECT(healed_sinks, healed_sinks, output, ec->nodes); + if (EC_COUNT(healed_sinks, ec->nodes) == 0) { + ret = -ENOTCONN; + goto unlock; + } + ret = __ec_fd_data_adjust_versions(frame, ec, fd, sources, healed_sinks, + versions, dirty, size); + } +unlock: + cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame, + ec->xl, ec->xl->name, fd->inode, 0, 0); + cluster_replies_wipe(replies, ec->nodes); + loc_wipe(&loc); + return ret; +} + +int +__ec_heal_data(call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on, + unsigned char *sources, unsigned char *healed_sinks) +{ + unsigned char *locked_on = NULL; + unsigned char *output = NULL; + uint64_t *versions = NULL; + uint64_t *dirty = NULL; + uint64_t *size = NULL; + unsigned char *trim = NULL; + default_args_cbk_t *replies = NULL; + int ret = 0; + int source = 0; + + locked_on = alloca0(ec->nodes); + output = alloca0(ec->nodes); + trim = alloca0(ec->nodes); + versions = alloca0(ec->nodes * sizeof(*versions)); + dirty = alloca0(ec->nodes * sizeof(*dirty)); + size = alloca0(ec->nodes * sizeof(*size)); + + EC_REPLIES_ALLOC(replies, ec->nodes); + ret = 
cluster_inodelk(ec->xl_list, heal_on, ec->nodes, replies, locked_on, + frame, ec->xl, ec->xl->name, fd->inode, 0, 0); + { + if (ret <= ec->fragments) { + gf_msg_debug(ec->xl->name, 0, + "%s: Skipping heal " + "as only %d number of subvolumes could " + "be locked", + uuid_utoa(fd->inode->gfid), ret); + ret = -ENOTCONN; + goto unlock; + } + + ret = __ec_heal_data_prepare(frame, ec, fd, locked_on, versions, dirty, + size, sources, healed_sinks, trim, NULL); + if (ret < 0) + goto unlock; + + if (EC_COUNT(healed_sinks, ec->nodes) == 0) { + ret = __ec_fd_data_adjust_versions( + frame, ec, fd, sources, healed_sinks, versions, dirty, size); + goto unlock; + } + + source = ret; + ret = __ec_heal_mark_sinks(frame, ec, fd, versions, healed_sinks); + if (ret < 0) + goto unlock; + + ret = __ec_heal_trim_sinks(frame, ec, fd, healed_sinks, trim, + size[source]); + } +unlock: + cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame, + ec->xl, ec->xl->name, fd->inode, 0, 0); + if (ret < 0) + goto out; + + if (EC_COUNT(healed_sinks, ec->nodes) == 0) + goto out; + + gf_msg_debug(ec->xl->name, 0, + "%s: sources: %d, sinks: " + "%d", + uuid_utoa(fd->inode->gfid), EC_COUNT(sources, ec->nodes), + EC_COUNT(healed_sinks, ec->nodes)); + + ret = ec_rebuild_data(frame, ec, fd, size[source], sources, healed_sinks); + if (ret < 0) + goto out; + + ret = ec_restore_time_and_adjust_versions( + frame, ec, fd, sources, healed_sinks, versions, dirty, size); +out: + cluster_replies_wipe(replies, ec->nodes); + return ret; +} + +int +ec_heal_data(call_frame_t *frame, ec_t *ec, gf_boolean_t block, inode_t *inode, + unsigned char *sources, unsigned char *healed_sinks) +{ + unsigned char *locked_on = NULL; + unsigned char *up_subvols = NULL; + unsigned char *output = NULL; + default_args_cbk_t *replies = NULL; + fd_t *fd = NULL; + loc_t loc = {0}; + char selfheal_domain[1024] = {0}; + int ret = 0; + + EC_REPLIES_ALLOC(replies, ec->nodes); + + locked_on = alloca0(ec->nodes); + output = 
alloca0(ec->nodes); + up_subvols = alloca0(ec->nodes); + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + fd = fd_create(inode, 0); + if (!fd) { + ret = -ENOMEM; + goto out; + } + + ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes); + + ret = cluster_open(ec->xl_list, up_subvols, ec->nodes, replies, output, + frame, ec->xl, &loc, O_RDWR | O_LARGEFILE, fd, NULL); + if (ret <= ec->fragments) { + ret = -ENOTCONN; + goto out; + } + + fd_bind(fd); + sprintf(selfheal_domain, "%s:self-heal", ec->xl->name); + /*If other processes are already doing the heal, don't block*/ + if (block) { + ret = cluster_inodelk(ec->xl_list, output, ec->nodes, replies, + locked_on, frame, ec->xl, selfheal_domain, inode, + 0, 0); + } else { + ret = cluster_tiebreaker_inodelk(ec->xl_list, output, ec->nodes, + replies, locked_on, frame, ec->xl, + selfheal_domain, inode, 0, 0); + } + { + if (ret <= ec->fragments) { + gf_msg_debug(ec->xl->name, 0, + "%s: Skipping heal " + "as only %d number of subvolumes could " + "be locked", + uuid_utoa(inode->gfid), ret); + ret = -ENOTCONN; + goto unlock; + } + ret = __ec_heal_data(frame, ec, fd, locked_on, sources, healed_sinks); + } +unlock: + cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame, + ec->xl, selfheal_domain, inode, 0, 0); +out: + if (fd) + fd_unref(fd); + loc_wipe(&loc); + cluster_replies_wipe(replies, ec->nodes); + return ret; +} + +int +ec_heal_purge_stale_index(call_frame_t *frame, ec_t *ec, inode_t *inode) +{ + int i = 0; + int ret = 0; + dict_t **xattr = NULL; + loc_t loc = {0}; + uint64_t dirty_xattr[EC_VERSION_SIZE] = {0}; + unsigned char *on = NULL; + default_args_cbk_t *replies = NULL; + dict_t *dict = NULL; + + /* Allocate the required memory */ + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + on = alloca0(ec->nodes); + EC_REPLIES_ALLOC(replies, ec->nodes); + xattr = GF_CALLOC(ec->nodes, sizeof(*xattr), gf_common_mt_pointer); + if (!xattr) { + ret = -ENOMEM; 
+ goto out; + } + dict = dict_new(); + if (!dict) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < ec->nodes; i++) { + xattr[i] = dict; + on[i] = 1; + } + ret = dict_set_static_bin(dict, EC_XATTR_DIRTY, dirty_xattr, + (sizeof(*dirty_xattr) * EC_VERSION_SIZE)); + if (ret < 0) { + ret = -ENOMEM; + goto out; + } + PARALLEL_FOP_ONLIST(ec->xl_list, on, ec->nodes, replies, frame, + ec_wind_xattrop_parallel, &loc, GF_XATTROP_ADD_ARRAY64, + xattr, NULL); +out: + if (dict) { + dict_unref(dict); + } + if (xattr) { + GF_FREE(xattr); + } + cluster_replies_wipe(replies, ec->nodes); + loc_wipe(&loc); + return ret; +} + +void +ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial) +{ + call_frame_t *frame = NULL; + unsigned char *participants = NULL; + unsigned char *msources = NULL; + unsigned char *mhealed_sinks = NULL; + unsigned char *sources = NULL; + unsigned char *healed_sinks = NULL; + ec_t *ec = NULL; + int ret = 0; + int op_ret = 0; + int op_errno = 0; + intptr_t mgood = 0; + intptr_t mbad = 0; + intptr_t good = 0; + intptr_t bad = 0; + uint32_t pending = 0; + ec_fop_data_t *fop = data; + gf_boolean_t blocking = _gf_false; + ec_heal_need_t need_heal = EC_HEAL_NONEED; + unsigned char *up_subvols = NULL; + char up_bricks[32]; + + ec = this->private; + + /* If it is heal request from getxattr, complete the heal and then + * unwind, if it is ec_heal with NULL as frame then no need to block + * the heal as the caller doesn't care about its completion. 
In case + * of heald whichever gets tiebreaking inodelk will take care of the + * heal, so no need to block*/ + if (fop->req_frame && !ec->shd.iamshd) + blocking = _gf_true; + + frame = create_frame(this, this->ctx->pool); + if (!frame) + goto out; + + ec_owner_set(frame, frame->root); + /*Do heal as root*/ + frame->root->uid = 0; + frame->root->gid = 0; + /*Mark the fops as internal*/ + frame->root->pid = GF_CLIENT_PID_SELF_HEALD; + participants = alloca0(ec->nodes); + ec_mask_to_char_array(ec->xl_up, participants, ec->nodes); + + up_subvols = alloca0(ec->nodes); + ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes); + + if (loc->name && strlen(loc->name)) { + ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name, + participants); + if (ret >= 0) { + gf_msg_debug(this->name, 0, + "%s: name heal " + "successful on %" PRIXPTR, + loc->path, + ec_char_array_to_mask(participants, ec->nodes)); + } else { + gf_msg_debug( + this->name, 0, + "%s: name heal " + "failed. ret = %d, subvolumes up = %s", + loc->path, ret, + ec_bin(up_bricks, sizeof(up_bricks), ec->xl_up, ec->nodes)); + } + } + + /* Mount triggers heal only when it detects that it must need heal, shd + * triggers heals periodically which need not be thorough*/ + if (ec->shd.iamshd && (ret <= 0)) { + ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false, + &need_heal); + + if (need_heal == EC_HEAL_PURGE_INDEX) { + gf_msg(ec->xl->name, GF_LOG_INFO, 0, EC_MSG_HEAL_FAIL, + "Index entry needs to be purged for: %s ", + uuid_utoa(loc->gfid)); + /* We need to send zero-xattrop so that stale index entry could be + * removed. We need not take lock on this entry to do so as + * xattrop on a brick is atomic. 
*/ + ec_heal_purge_stale_index(frame, ec, loc->inode); + goto out; + } else if (need_heal == EC_HEAL_NONEED) { + gf_msg(ec->xl->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_FAIL, + "Heal is not required for : %s ", uuid_utoa(loc->gfid)); + goto out; + } + } + + sources = alloca0(ec->nodes); + healed_sinks = alloca0(ec->nodes); + if (IA_ISREG(loc->inode->ia_type)) { + ret = ec_heal_data(frame, ec, blocking, loc->inode, sources, + healed_sinks); + } else if (IA_ISDIR(loc->inode->ia_type) && !partial) { + ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks, + &pending); + } else { + ret = 0; + memcpy(sources, participants, ec->nodes); + memcpy(healed_sinks, participants, ec->nodes); + } + + if (ret == 0) { + good = ec_char_array_to_mask(sources, ec->nodes); + bad = ec_char_array_to_mask(healed_sinks, ec->nodes); + } else { + op_ret = -1; + op_errno = -ret; + } + msources = alloca0(ec->nodes); + mhealed_sinks = alloca0(ec->nodes); + ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks); + if (ret == 0) { + mgood = ec_char_array_to_mask(msources, ec->nodes); + mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes); + } else { + op_ret = -1; + op_errno = -ret; + } + +out: + ec_reset_entry_healing(fop); + if (fop->cbks.heal) { + fop->cbks.heal(fop->req_frame, fop->data, fop->xl, op_ret, op_errno, + ec_char_array_to_mask(participants, ec->nodes), + mgood & good, mbad & bad, pending, NULL); + } + if (frame) + STACK_DESTROY(frame->root); + + return; +} + +int +ec_synctask_heal_wrap(void *opaque) +{ + ec_fop_data_t *fop = opaque; + ec_heal_do(fop->xl, fop, &fop->loc[0], fop->int32); + return 0; +} + +int +ec_heal_done(int ret, call_frame_t *heal, void *opaque) +{ + if (opaque) + ec_fop_data_release(opaque); + return 0; +} + +ec_fop_data_t * +__ec_dequeue_heals(ec_t *ec) +{ + ec_fop_data_t *fop = NULL; + + if (list_empty(&ec->heal_waiting)) + goto none; + + if ((ec->background_heals > 0) && (ec->healers >= ec->background_heals)) + goto none; + + fop = 
list_entry(ec->heal_waiting.next, ec_fop_data_t, healer); + ec->heal_waiters--; + list_del_init(&fop->healer); + list_add(&fop->healer, &ec->healing); + ec->healers++; + return fop; +none: + gf_msg_debug(ec->xl->name, 0, "Num healers: %d, Num Waiters: %d", + ec->healers, ec->heal_waiters); + return NULL; +} + +void +ec_heal_fail(ec_t *ec, ec_fop_data_t *fop) +{ + if (fop->cbks.heal) { + fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0, + 0, 0, NULL); + } + ec_fop_data_release(fop); +} + +void +ec_launch_heal(ec_t *ec, ec_fop_data_t *fop) +{ + int ret = 0; + call_frame_t *frame = NULL; + + frame = create_frame(ec->xl, ec->xl->ctx->pool); + if (!frame) { + ret = -1; + goto out; + } + + ec_owner_set(frame, frame->root); + /*Do heal as root*/ + frame->root->uid = 0; + frame->root->gid = 0; + /*Mark the fops as internal*/ + frame->root->pid = GF_CLIENT_PID_SELF_HEALD; + + ret = synctask_new(ec->xl->ctx->env, ec_synctask_heal_wrap, ec_heal_done, + frame, fop); +out: + if (ret < 0) { + ec_fop_set_error(fop, ENOMEM); + ec_heal_fail(ec, fop); + } + + if (frame) + STACK_DESTROY(frame->root); +} + +void +ec_handle_healers_done(ec_fop_data_t *fop) +{ + ec_t *ec = fop->xl->private; + ec_fop_data_t *heal_fop = NULL; + + if (list_empty(&fop->healer)) + return; + + LOCK(&ec->lock); + + list_del_init(&fop->healer); + + do { + ec->healers--; + heal_fop = __ec_dequeue_heals(ec); + + if ((heal_fop != NULL) && ec->shutdown) { + /* This will prevent ec_handle_healers_done() to be + * called recursively. That would be problematic if + * the queue is too big. 
*/ + list_del_init(&heal_fop->healer); + + UNLOCK(&ec->lock); + + ec_fop_set_error(fop, ENOTCONN); + ec_heal_fail(ec, heal_fop); + + LOCK(&ec->lock); + } + } while ((heal_fop != NULL) && ec->shutdown); + + UNLOCK(&ec->lock); + + if (heal_fop) + ec_launch_heal(ec, heal_fop); +} + +gf_boolean_t +ec_is_entry_healing(ec_fop_data_t *fop) +{ + ec_inode_t *ctx = NULL; + int32_t heal_count = 0; + loc_t *loc = NULL; + + loc = &fop->loc[0]; + + LOCK(&loc->inode->lock); + { + ctx = __ec_inode_get(loc->inode, fop->xl); + if (ctx) { + heal_count = ctx->heal_count; + } + } + UNLOCK(&loc->inode->lock); + GF_ASSERT(heal_count >= 0); + return heal_count; +} + +void +ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop) +{ + gf_boolean_t can_heal = _gf_true; + ec_t *ec = this->private; + ec_fop_data_t *fop_rel = NULL; + + if (fop->req_frame == NULL) { + LOCK(&ec->lock); + { + if ((ec->background_heals > 0) && + (ec->heal_wait_qlen + ec->background_heals) > + (ec->heal_waiters + ec->healers)) { + if (!ec_is_entry_healing(fop)) { + list_add_tail(&fop->healer, &ec->heal_waiting); + ec->heal_waiters++; + ec_set_entry_healing(fop); + } else { + fop_rel = fop; + } + fop = __ec_dequeue_heals(ec); + } else { + can_heal = _gf_false; + } + } + UNLOCK(&ec->lock); + } + + if (can_heal) { + if (fop) { + if (fop->req_frame != NULL) { + ec_set_entry_healing(fop); + } + ec_launch_heal(ec, fop); + } + } else { + gf_msg_debug(this->name, 0, + "Max number of heals are " + "pending, background self-heal rejected"); + ec_fop_set_error(fop, EBUSY); + ec_heal_fail(ec, fop); + } + if (fop_rel) { + ec_heal_done(0, NULL, fop_rel); + } +} + +void +ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_heal_cbk_t func, void *data, loc_t *loc, + int32_t partial, dict_t *xdata) +{ + ec_cbk_t callback = {.heal = func}; + ec_fop_data_t *fop = NULL; + int32_t err = EINVAL; + + gf_msg_trace("ec", 0, "EC(HEAL) %p", frame); + + VALIDATE_OR_GOTO(this, fail); + 
GF_VALIDATE_OR_GOTO(this->name, this->private, fail); + + if (!loc || !loc->inode || gf_uuid_is_null(loc->inode->gfid)) + goto fail; + + if (frame && frame->local) + goto fail; + fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, 0, target, fop_flags, + NULL, NULL, callback, data); + + err = ENOMEM; + + if (fop == NULL) + goto fail; + + fop->int32 = partial; + + if (loc) { + if (loc_copy(&fop->loc[0], loc) != 0) + goto fail; + } + + if (xdata) + fop->xdata = dict_ref(xdata); + + ec_heal_throttle(this, fop); + + return; + +fail: + if (fop) + ec_fop_data_release(fop); + if (func) + func(frame, data, this, -1, err, 0, 0, 0, 0, NULL); +} + +int +ec_replace_heal_done(int ret, call_frame_t *heal, void *opaque) +{ + ec_t *ec = opaque; + gf_boolean_t last_fop = _gf_false; + + if (GF_ATOMIC_DEC(ec->async_fop_count) == 0) { + LOCK(&ec->lock); + { + last_fop = __ec_is_last_fop(ec); + } + UNLOCK(&ec->lock); + } + gf_msg_debug(ec->xl->name, 0, "getxattr on bricks is done ret %d", ret); + + if (last_fop) + ec_pending_fops_completed(ec); + + return 0; +} + +int32_t +ec_replace_heal(ec_t *ec, inode_t *inode) +{ + loc_t loc = {0}; + int ret = 0; + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + ret = syncop_getxattr(ec->xl, &loc, NULL, EC_XATTR_HEAL, NULL, NULL); + if (ret < 0) + gf_msg_debug(ec->xl->name, 0, "Heal failed for replace brick ret = %d", + ret); + + /* Once the root inode has been checked, it might have triggered a + * self-heal on it after a replace brick command or for some other + * reason. It can also happen that the volume already had damaged + * files in the index, even if the heal on the root directory failed. + * In both cases we need to wake all index healers to continue + * healing remaining entries that are marked as dirty. 
*/ + ec_shd_index_healer_wake(ec); + + loc_wipe(&loc); + return ret; +} + +int32_t +ec_replace_brick_heal_wrap(void *opaque) +{ + ec_t *ec = opaque; + inode_table_t *itable = NULL; + int32_t ret = -1; + + if (ec->xl->itable) + itable = ec->xl->itable; + else + goto out; + + if (xlator_is_cleanup_starting(ec->xl)) + goto out; + + ret = ec_replace_heal(ec, itable->root); +out: + return ret; +} + +int32_t +ec_launch_replace_heal(ec_t *ec) +{ + int ret = -1; + + ret = synctask_new(ec->xl->ctx->env, ec_replace_brick_heal_wrap, + ec_replace_heal_done, NULL, ec); + + if (ret < 0) { + gf_msg_debug(ec->xl->name, 0, "Heal failed for replace brick ret = %d", + ret); + ec_replace_heal_done(-1, NULL, ec); + } + + return ret; +} + +int32_t +ec_set_heal_info(dict_t **dict_rsp, char *status) +{ + dict_t *dict = NULL; + int ret = 0; + + dict = dict_new(); + if (!dict) { + ret = -ENOMEM; + goto out; + } + ret = dict_set_str(dict, "heal-info", status); + if (ret) { + gf_msg(THIS->name, GF_LOG_WARNING, -ret, EC_MSG_HEAL_FAIL, + "Failed to set heal-info key to " + "%s", + status); + dict_unref(dict); + dict = NULL; + } + *dict_rsp = dict; +out: + return ret; +} + +static int32_t +_need_heal_calculate(ec_t *ec, uint64_t *dirty, unsigned char *sources, + gf_boolean_t self_locked, int32_t lock_count, + ec_heal_need_t *need_heal, uint64_t *versions) +{ + int i = 0; + int source_count = 0; + + source_count = EC_COUNT(sources, ec->nodes); + if (source_count == ec->nodes) { + *need_heal = EC_HEAL_NONEED; + if (self_locked || lock_count == 0) { + for (i = 0; i < ec->nodes; i++) { + if (dirty[i] || (versions[i] != versions[0])) { + *need_heal = EC_HEAL_MUST; + goto out; + } + } + /* If lock count is 0, all dirty flags are 0 and all the + * versions are macthing then why are we here. It looks + * like something went wrong while removing the index entries + * after completing a successful heal or fop. 
In this case + * we need to remove this index entry to avoid triggering heal + * in a loop and causing lookups again and again*/ + *need_heal = EC_HEAL_PURGE_INDEX; + } else { + for (i = 0; i < ec->nodes; i++) { + /* Since each lock can only increment the dirty + * count once, if dirty is > 1 it means that + * another operation has left the dirty count + * set and this indicates a problem in the + * inode.*/ + if (dirty[i] > 1) { + *need_heal = EC_HEAL_MUST; + goto out; + } + if (dirty[i] != dirty[0] || (versions[i] != versions[0])) { + *need_heal = EC_HEAL_MAYBE; + } + } + } + } else { + *need_heal = EC_HEAL_MUST; + } + +out: + return source_count; +} + +static int32_t +ec_need_metadata_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies, + int32_t lock_count, gf_boolean_t self_locked, + gf_boolean_t thorough, ec_heal_need_t *need_heal) +{ + uint64_t *dirty = NULL; + unsigned char *sources = NULL; + unsigned char *healed_sinks = NULL; + uint64_t *meta_versions = NULL; + int ret = 0; + + sources = alloca0(ec->nodes); + healed_sinks = alloca0(ec->nodes); + dirty = alloca0(ec->nodes * sizeof(*dirty)); + meta_versions = alloca0(ec->nodes * sizeof(*meta_versions)); + ret = ec_heal_metadata_find_direction(ec, replies, meta_versions, dirty, + sources, healed_sinks); + if (ret < 0 && ret != -EIO) { + goto out; + } + + ret = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count, + need_heal, meta_versions); +out: + return ret; +} + +static int32_t +ec_need_data_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies, + int32_t lock_count, gf_boolean_t self_locked, + gf_boolean_t thorough, ec_heal_need_t *need_heal) +{ + uint64_t *dirty = NULL; + unsigned char *sources = NULL; + unsigned char *healed_sinks = NULL; + uint64_t *data_versions = NULL; + uint64_t *size = NULL; + int ret = 0; + + sources = alloca0(ec->nodes); + healed_sinks = alloca0(ec->nodes); + dirty = alloca0(ec->nodes * sizeof(*dirty)); + data_versions = alloca0(ec->nodes * 
sizeof(*data_versions)); + size = alloca0(ec->nodes * sizeof(*size)); + + /* When dd is going on and heal info is called there is a very good + * chance for on disk sizes to mismatch even though nothing is wrong + * we don't need ondisk size check there. But if the file is either + * self-locked or the caller wants a thorough check then make sure to + * perform on disk check also. */ + ret = ec_heal_data_find_direction( + ec, replies, data_versions, dirty, size, sources, healed_sinks, + self_locked || thorough, EC_COMBINE_XDATA); + if (ret < 0 && ret != -EIO) { + goto out; + } + + ret = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count, + need_heal, data_versions); +out: + return ret; +} + +static int32_t +ec_need_entry_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies, + int32_t lock_count, gf_boolean_t self_locked, + gf_boolean_t thorough, ec_heal_need_t *need_heal) +{ + uint64_t *dirty = NULL; + unsigned char *sources = NULL; + unsigned char *healed_sinks = NULL; + uint64_t *data_versions = NULL; + int ret = 0; + + sources = alloca0(ec->nodes); + healed_sinks = alloca0(ec->nodes); + dirty = alloca0(ec->nodes * sizeof(*dirty)); + data_versions = alloca0(ec->nodes * sizeof(*data_versions)); + + ret = ec_heal_entry_find_direction(ec, replies, data_versions, dirty, + sources, healed_sinks); + if (ret < 0 && ret != -EIO) { + goto out; + } + + ret = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count, + need_heal, data_versions); +out: + return ret; +} + +static int32_t +ec_need_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies, + int32_t lock_count, gf_boolean_t self_locked, + gf_boolean_t thorough, ec_heal_need_t *need_heal) +{ + int ret = 0; + + ret = ec_need_metadata_heal(ec, inode, replies, lock_count, self_locked, + thorough, need_heal); + if (ret < 0) + goto out; + + if (*need_heal == EC_HEAL_MUST) + goto out; + + if (inode->ia_type == IA_IFREG) { + ret = ec_need_data_heal(ec, inode, replies, lock_count, 
self_locked, + thorough, need_heal); + } else if (inode->ia_type == IA_IFDIR) { + ret = ec_need_entry_heal(ec, inode, replies, lock_count, self_locked, + thorough, need_heal); + } + +out: + return ret; +} + +int32_t +ec_heal_inspect(call_frame_t *frame, ec_t *ec, inode_t *inode, + unsigned char *locked_on, gf_boolean_t self_locked, + gf_boolean_t thorough, ec_heal_need_t *need_heal) +{ + loc_t loc = {0}; + int i = 0; + int ret = 0; + dict_t *xdata = NULL; + uint64_t zero_array[2] = {0}; + uint64_t zero_value = 0; + unsigned char *output = NULL; + default_args_cbk_t *replies = NULL; + int32_t lock_count = 0; + + EC_REPLIES_ALLOC(replies, ec->nodes); + output = alloca0(ec->nodes); + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + xdata = dict_new(); + if (!xdata || + dict_set_static_bin(xdata, EC_XATTR_VERSION, zero_array, + sizeof(zero_array)) || + dict_set_static_bin(xdata, EC_XATTR_DIRTY, zero_array, + sizeof(zero_array)) || + dict_set_static_bin(xdata, EC_XATTR_SIZE, &zero_value, + sizeof(zero_value))) { + ret = -ENOMEM; + goto out; + } + + if (!self_locked) { + ret = dict_set_str(xdata, GLUSTERFS_INODELK_DOM_COUNT, ec->xl->name); + if (ret) { + ret = -ENOMEM; + goto out; + } + } + + ret = cluster_lookup(ec->xl_list, locked_on, ec->nodes, replies, output, + frame, ec->xl, &loc, xdata); + + if (ret != ec->nodes) { + ret = ec->nodes; + *need_heal = EC_HEAL_MUST; + goto out; + } + + if (self_locked) + goto need_heal; + + for (i = 0; i < ec->nodes; i++) { + if (!output[i] || !replies[i].xdata) { + continue; + } + if ((dict_get_int32(replies[i].xdata, GLUSTERFS_INODELK_COUNT, + &lock_count) == 0) && + lock_count > 0) { + break; + } + } +need_heal: + ret = ec_need_heal(ec, inode, replies, lock_count, self_locked, thorough, + need_heal); +out: + cluster_replies_wipe(replies, ec->nodes); + loc_wipe(&loc); + if (xdata) { + dict_unref(xdata); + } + return ret; +} + +int32_t +ec_heal_locked_inspect(call_frame_t *frame, ec_t *ec, inode_t *inode, + 
ec_heal_need_t *need_heal) +{ + unsigned char *locked_on = NULL; + unsigned char *up_subvols = NULL; + unsigned char *output = NULL; + default_args_cbk_t *replies = NULL; + int ret = 0; + + EC_REPLIES_ALLOC(replies, ec->nodes); + locked_on = alloca0(ec->nodes); + output = alloca0(ec->nodes); + up_subvols = alloca0(ec->nodes); + ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes); + + ret = cluster_inodelk(ec->xl_list, up_subvols, ec->nodes, replies, + locked_on, frame, ec->xl, ec->xl->name, inode, 0, 0); + if (ret != ec->nodes) { + *need_heal = EC_HEAL_MUST; + goto unlock; + } + ret = ec_heal_inspect(frame, ec, inode, locked_on, _gf_true, _gf_true, + need_heal); +unlock: + cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame, + ec->xl, ec->xl->name, inode, 0, 0); + cluster_replies_wipe(replies, ec->nodes); + return ret; +} + +int32_t +ec_get_heal_info(xlator_t *this, loc_t *entry_loc, dict_t **dict_rsp) +{ + int ret = -ENOMEM; + ec_heal_need_t need_heal = EC_HEAL_NONEED; + call_frame_t *frame = NULL; + ec_t *ec = NULL; + unsigned char *up_subvols = NULL; + loc_t loc = { + 0, + }; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, entry_loc, out); + + ec = this->private; + up_subvols = alloca0(ec->nodes); + ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes); + + if (EC_COUNT(up_subvols, ec->nodes) != ec->nodes) { + need_heal = EC_HEAL_MUST; + goto set_heal; + } + frame = create_frame(this, this->ctx->pool); + if (!frame) { + goto out; + } + ec_owner_set(frame, frame->root); + frame->root->uid = 0; + frame->root->gid = 0; + frame->root->pid = GF_CLIENT_PID_SELF_HEALD; + + if (loc_copy(&loc, entry_loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + goto out; + } + if (!loc.inode) { + ret = syncop_inode_find(this, this, loc.gfid, &loc.inode, NULL, NULL); + if (ret < 0) + goto out; + } + + ret = ec_heal_inspect(frame, ec, loc.inode, up_subvols, _gf_false, + 
_gf_false, &need_heal); + if (ret == ec->nodes && need_heal != EC_HEAL_MAYBE) { + goto set_heal; + } + need_heal = EC_HEAL_NONEED; + ret = ec_heal_locked_inspect(frame, ec, loc.inode, &need_heal); + if (ret < 0) + goto out; +set_heal: + if (need_heal == EC_HEAL_MUST) { + ret = ec_set_heal_info(dict_rsp, "heal"); + } else { + ret = ec_set_heal_info(dict_rsp, "no-heal"); + } +out: + if (frame) { + STACK_DESTROY(frame->root); + } + loc_wipe(&loc); + return ret; +} diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c new file mode 100644 index 00000000000..5c1586bc9c5 --- /dev/null +++ b/xlators/cluster/ec/src/ec-heald.c @@ -0,0 +1,681 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <glusterfs/defaults.h> +#include <glusterfs/compat-errno.h> +#include "ec.h" +#include "ec-messages.h" +#include "ec-heald.h" +#include "ec-mem-types.h" +#include <glusterfs/syncop.h> +#include <glusterfs/syncop-utils.h> +#include "protocol-common.h" + +#define NTH_INDEX_HEALER(this, n) \ + (&((((ec_t *)this->private))->shd.index_healers[n])) +#define NTH_FULL_HEALER(this, n) \ + (&((((ec_t *)this->private))->shd.full_healers[n])) + +gf_boolean_t +ec_shd_is_subvol_local(xlator_t *this, int subvol) +{ + ec_t *ec = NULL; + gf_boolean_t is_local = _gf_false; + loc_t loc = { + 0, + }; + + ec = this->private; + loc.inode = this->itable->root; + syncop_is_subvol_local(ec->xl_list[subvol], &loc, &is_local); + return is_local; +} + +/* Return the name of child 'subvol', or NULL when the index is not in + * [0, ec->nodes). */ +char * +ec_subvol_name(xlator_t *this, int subvol) +{ + ec_t *ec = NULL; + + ec = this->private; + /* Callers index xl_list with 0 <= i < ec->nodes, so 'nodes' itself is + * already one past the last valid entry. */ + if (subvol < 0 || subvol >= ec->nodes) + return NULL; + + return ec->xl_list[subvol]->name; +} + +int +__ec_shd_healer_wait(struct subvol_healer *healer) +{ + ec_t *ec = NULL; + struct timespec wait_till = { + 0, + }; + int ret = 0; + + ec = healer->this->private; + +disabled_loop: + wait_till.tv_sec = gf_time() + ec->shd.timeout; + + while (!healer->rerun) { + ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till); + if (ret == ETIMEDOUT) + break; + } + + if (ec->shutdown) { + healer->running = _gf_false; + return -1; + } + + ret = healer->rerun; + healer->rerun = 0; + + if (!ec->shd.enabled || !ec->up) + goto disabled_loop; + + return ret; +} + +int +ec_shd_healer_wait(struct subvol_healer *healer) +{ + int ret = 0; + + pthread_mutex_lock(&healer->mutex); + { + ret = __ec_shd_healer_wait(healer); + } + pthread_mutex_unlock(&healer->mutex); + + return ret; +} + +int +ec_shd_index_inode(xlator_t *this, xlator_t *subvol, inode_t **inode) +{ + loc_t rootloc = { + 0, + }; + int ret = 0; + dict_t *xattr = NULL; + void *index_gfid = NULL; + + *inode = NULL; + rootloc.inode = inode_ref(this->itable->root); + 
gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid); + + ret = syncop_getxattr(subvol, &rootloc, &xattr, GF_XATTROP_INDEX_GFID, NULL, + NULL); + if (ret < 0) + goto out; + if (!xattr) { + ret = -EINVAL; + goto out; + } + + ret = dict_get_ptr(xattr, GF_XATTROP_INDEX_GFID, &index_gfid); + if (ret) + goto out; + + gf_msg_debug(this->name, 0, "index-dir gfid for %s: %s", subvol->name, + uuid_utoa(index_gfid)); + + ret = syncop_inode_find(this, subvol, index_gfid, inode, NULL, NULL); + +out: + loc_wipe(&rootloc); + + if (xattr) + dict_unref(xattr); + + return ret; +} + +int +ec_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name) +{ + loc_t loc = { + 0, + }; + int ret = 0; + + loc.parent = inode_ref(inode); + loc.name = name; + + ret = syncop_unlink(subvol, &loc, NULL, NULL); + + loc_wipe(&loc); + return ret; +} + +static gf_boolean_t +ec_is_heal_completed(char *status) +{ + char *bad_pos = NULL; + char *zero_pos = NULL; + + if (!status) { + return _gf_false; + } + + /*Logic: + * Status will be of the form Good: <binary>, Bad: <binary> + * If heal completes, if we do strchr for '0' it should be present after + * 'Bad:' i.e. 
strRchr for ':' + * */ + + zero_pos = strchr(status, '0'); + bad_pos = strrchr(status, ':'); + if (!zero_pos || !bad_pos) { + /*malformed status*/ + return _gf_false; + } + + if (zero_pos > bad_pos) { + return _gf_true; + } + + return _gf_false; +} + +int +ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc, + gf_boolean_t full) +{ + dict_t *xdata = NULL; + dict_t *dict = NULL; + uint32_t count; + int32_t ret; + char *heal_status = NULL; + ec_t *ec = healer->this->private; + + GF_ATOMIC_INC(ec->stats.shd.attempted); + ret = syncop_getxattr(healer->this, loc, &dict, EC_XATTR_HEAL, NULL, + &xdata); + if (ret == 0) { + if (dict && (dict_get_str(dict, EC_XATTR_HEAL, &heal_status) == 0)) { + if (ec_is_heal_completed(heal_status)) { + GF_ATOMIC_INC(ec->stats.shd.completed); + } + } + } + + if (!full && (loc->inode->ia_type == IA_IFDIR)) { + /* If we have just healed a directory, it's possible that + * other index entries have appeared to be healed. */ + if ((xdata != NULL) && + (dict_get_uint32(xdata, EC_XATTR_HEAL_NEW, &count) == 0) && + (count > 0)) { + /* Force a rerun of the index healer. 
*/ + gf_msg_debug(healer->this->name, 0, "%d more entries to heal", + count); + + healer->rerun = _gf_true; + } + } + + if (xdata != NULL) { + dict_unref(xdata); + } + + if (dict) { + dict_unref(dict); + } + + return ret; +} + +int +ec_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + void *data) +{ + struct subvol_healer *healer = data; + ec_t *ec = NULL; + loc_t loc = {0}; + int ret = 0; + + ec = healer->this->private; + if (ec->xl_up_count <= ec->fragments) { + return -ENOTCONN; + } + if (!ec->shd.enabled) + return -EBUSY; + + gf_msg_debug(healer->this->name, 0, "got entry: %s", entry->d_name); + + ret = gf_uuid_parse(entry->d_name, loc.gfid); + if (ret) + return 0; + + /* If this fails with ENOENT/ESTALE index is stale */ + ret = syncop_gfid_to_path(healer->this->itable, subvol, loc.gfid, + (char **)&loc.path); + if (ret < 0) + goto out; + + ret = syncop_inode_find(healer->this, healer->this, loc.gfid, &loc.inode, + NULL, NULL); + if (ret < 0) + goto out; + + ec_shd_selfheal(healer, healer->subvol, &loc, _gf_false); +out: + if (ret == -ENOENT || ret == -ESTALE) { + gf_msg(healer->this->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_FAIL, + "Purging index for gfid %s:", uuid_utoa(loc.gfid)); + ec_shd_index_purge(subvol, parent->inode, entry->d_name); + } + loc_wipe(&loc); + + return 0; +} + +int +ec_shd_index_sweep(struct subvol_healer *healer) +{ + loc_t loc = {0}; + ec_t *ec = NULL; + int ret = 0; + xlator_t *subvol = NULL; + dict_t *xdata = NULL; + + ec = healer->this->private; + subvol = ec->xl_list[healer->subvol]; + + ret = ec_shd_index_inode(healer->this, subvol, &loc.inode); + if (ret < 0) { + gf_msg(healer->this->name, GF_LOG_WARNING, errno, + EC_MSG_INDEX_DIR_GET_FAIL, "unable to get index-dir on %s", + subvol->name); + goto out; + } + + xdata = dict_new(); + if (!xdata || dict_set_int32(xdata, "get-gfid-type", 1)) { + ret = -ENOMEM; + goto out; + } + + _mask_cancellation(); + ret = syncop_mt_dir_scan(NULL, subvol, &loc, 
GF_CLIENT_PID_SELF_HEALD, + healer, ec_shd_index_heal, xdata, + ec->shd.max_threads, ec->shd.wait_qlength); + _unmask_cancellation(); +out: + if (xdata) + dict_unref(xdata); + loc_wipe(&loc); + + return ret; +} + +int +ec_shd_full_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + void *data) +{ + struct subvol_healer *healer = data; + xlator_t *this = healer->this; + ec_t *ec = NULL; + loc_t loc = {0}; + int ret = 0; + + ec = this->private; + + if (this->cleanup_starting) { + return -ENOTCONN; + } + + if (ec->xl_up_count <= ec->fragments) { + return -ENOTCONN; + } + if (!ec->shd.enabled) + return -EBUSY; + + if (gf_uuid_is_null(entry->d_stat.ia_gfid)) { + /* It's possible that an entry has been removed just after + * being seen in a directory but before getting its stat info. + * In this case we'll receive a NULL gfid here. Since the file + * doesn't exist anymore, we can safely ignore it. */ + return 0; + } + + loc.parent = inode_ref(parent->inode); + loc.name = entry->d_name; + gf_uuid_copy(loc.gfid, entry->d_stat.ia_gfid); + + /* If this fails with ENOENT/ESTALE index is stale */ + ret = syncop_gfid_to_path(this->itable, subvol, loc.gfid, + (char **)&loc.path); + if (ret < 0) + goto out; + + ret = syncop_inode_find(this, this, loc.gfid, &loc.inode, NULL, NULL); + if (ret < 0) + goto out; + + ec_shd_selfheal(healer, healer->subvol, &loc, _gf_true); + + ret = 0; + +out: + loc_wipe(&loc); + return ret; +} + +int +ec_shd_full_sweep(struct subvol_healer *healer, inode_t *inode) +{ + ec_t *ec = NULL; + loc_t loc = {0}; + int ret = -1; + + ec = healer->this->private; + loc.inode = inode; + _mask_cancellation(); + ret = syncop_ftw(ec->xl_list[healer->subvol], &loc, + GF_CLIENT_PID_SELF_HEALD, healer, ec_shd_full_heal); + _unmask_cancellation(); + return ret; +} + +void * +ec_shd_index_healer(void *data) +{ + struct subvol_healer *healer = NULL; + xlator_t *this = NULL; + int run = 0; + + healer = data; + THIS = this = healer->this; + ec_t *ec = this->private; 
+ + for (;;) { + run = ec_shd_healer_wait(healer); + if (run == -1) + break; + + if (ec->xl_up_count > ec->fragments) { + gf_msg_debug(this->name, 0, "starting index sweep on subvol %s", + ec_subvol_name(this, healer->subvol)); + ec_shd_index_sweep(healer); + } + gf_msg_debug(this->name, 0, "finished index sweep on subvol %s", + ec_subvol_name(this, healer->subvol)); + } + + return NULL; +} + +void * +ec_shd_full_healer(void *data) +{ + struct subvol_healer *healer = NULL; + xlator_t *this = NULL; + loc_t rootloc = {0}; + + int run = 0; + + healer = data; + THIS = this = healer->this; + ec_t *ec = this->private; + + rootloc.inode = this->itable->root; + for (;;) { + run = ec_shd_healer_wait(healer); + if (run < 0) { + break; + } else if (run == 0) { + continue; + } + + if (ec->xl_up_count > ec->fragments) { + gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_FULL_SWEEP_START, + "starting full sweep on subvol %s", + ec_subvol_name(this, healer->subvol)); + + ec_shd_selfheal(healer, healer->subvol, &rootloc, _gf_true); + ec_shd_full_sweep(healer, this->itable->root); + } + + gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_FULL_SWEEP_STOP, + "finished full sweep on subvol %s", + ec_subvol_name(this, healer->subvol)); + } + + return NULL; +} + +int +ec_shd_healer_init(xlator_t *this, struct subvol_healer *healer) +{ + int ret = 0; + + ret = pthread_mutex_init(&healer->mutex, NULL); + if (ret) + goto out; + + ret = pthread_cond_init(&healer->cond, NULL); + if (ret) + goto out; + + healer->this = this; + healer->running = _gf_false; + healer->rerun = _gf_false; +out: + return ret; +} + +int +ec_shd_healer_spawn(xlator_t *this, struct subvol_healer *healer, + void *(threadfn)(void *)) +{ + int ret = 0; + + pthread_mutex_lock(&healer->mutex); + { + if (healer->running) { + pthread_cond_signal(&healer->cond); + } else { + ret = gf_thread_create(&healer->thread, NULL, threadfn, healer, + "ecshd"); + if (ret) + goto unlock; + healer->running = 1; + } + + healer->rerun = 1; + } +unlock: + 
pthread_mutex_unlock(&healer->mutex); + + return ret; +} + +int +ec_shd_full_healer_spawn(xlator_t *this, int subvol) +{ + if (xlator_is_cleanup_starting(this)) + return -1; + + return ec_shd_healer_spawn(this, NTH_FULL_HEALER(this, subvol), + ec_shd_full_healer); +} + +int +ec_shd_index_healer_spawn(xlator_t *this, int subvol) +{ + if (xlator_is_cleanup_starting(this)) + return -1; + + return ec_shd_healer_spawn(this, NTH_INDEX_HEALER(this, subvol), + ec_shd_index_healer); +} + +void +ec_shd_index_healer_wake(ec_t *ec) +{ + int32_t i; + + for (i = 0; i < ec->nodes; i++) { + if (((ec->xl_up >> i) & 1) != 0) { + ec_shd_index_healer_spawn(ec->xl, i); + } + } +} + +int +ec_selfheal_daemon_init(xlator_t *this) +{ + ec_t *ec = NULL; + ec_self_heald_t *shd = NULL; + int ret = -1; + int i = 0; + + ec = this->private; + shd = &ec->shd; + + shd->index_healers = GF_CALLOC(sizeof(*shd->index_healers), ec->nodes, + ec_mt_subvol_healer_t); + if (!shd->index_healers) + goto out; + + for (i = 0; i < ec->nodes; i++) { + shd->index_healers[i].subvol = i; + ret = ec_shd_healer_init(this, &shd->index_healers[i]); + if (ret) + goto out; + } + + shd->full_healers = GF_CALLOC(sizeof(*shd->full_healers), ec->nodes, + ec_mt_subvol_healer_t); + if (!shd->full_healers) + goto out; + + for (i = 0; i < ec->nodes; i++) { + shd->full_healers[i].subvol = i; + ret = ec_shd_healer_init(this, &shd->full_healers[i]); + if (ret) + goto out; + } + + ret = 0; +out: + return ret; +} + +int +ec_heal_op(xlator_t *this, dict_t *output, gf_xl_afr_op_t op, int xl_id) +{ + char key[64] = {0}; + int op_ret = 0; + ec_t *ec = NULL; + int i = 0; + GF_UNUSED int ret = 0; + + ec = this->private; + + op_ret = -1; + for (i = 0; i < ec->nodes; i++) { + snprintf(key, sizeof(key), "%d-%d-status", xl_id, i); + + if (((ec->xl_up >> i) & 1) == 0) { + ret = dict_set_str(output, key, "Brick is not connected"); + } else if (!ec->up) { + ret = dict_set_str(output, key, "Disperse subvolume is not up"); + } else if 
(!ec_shd_is_subvol_local(this, i)) { + ret = dict_set_str(output, key, "Brick is remote"); + } else { + ret = dict_set_str(output, key, "Started self-heal"); + if (op == GF_SHD_OP_HEAL_FULL) { + ec_shd_full_healer_spawn(this, i); + } else if (op == GF_SHD_OP_HEAL_INDEX) { + ec_shd_index_healer_spawn(this, i); + } + op_ret = 0; + } + } + return op_ret; +} + +int +ec_xl_op(xlator_t *this, dict_t *input, dict_t *output) +{ + gf_xl_afr_op_t op = GF_SHD_OP_INVALID; + int ret = 0; + int xl_id = 0; + + ret = dict_get_int32(input, "xl-op", (int32_t *)&op); + if (ret) + goto out; + + ret = dict_get_int32(input, this->name, &xl_id); + if (ret) + goto out; + + ret = dict_set_int32(output, this->name, xl_id); + if (ret) + goto out; + + switch (op) { + case GF_SHD_OP_HEAL_FULL: + ret = ec_heal_op(this, output, op, xl_id); + break; + + case GF_SHD_OP_HEAL_INDEX: + ret = ec_heal_op(this, output, op, xl_id); + break; + + default: + ret = -1; + break; + } +out: + dict_del(output, this->name); + return ret; +} + +void +ec_destroy_healer_object(xlator_t *this, struct subvol_healer *healer) +{ + if (!healer) + return; + + pthread_cond_destroy(&healer->cond); + pthread_mutex_destroy(&healer->mutex); +} + +void +ec_selfheal_daemon_fini(xlator_t *this) +{ + struct subvol_healer *healer = NULL; + ec_self_heald_t *shd = NULL; + ec_t *priv = NULL; + int i = 0; + + priv = this->private; + if (!priv) + return; + + shd = &priv->shd; + if (!shd->iamshd) + return; + + for (i = 0; i < priv->nodes; i++) { + healer = &shd->index_healers[i]; + ec_destroy_healer_object(this, healer); + + healer = &shd->full_healers[i]; + ec_destroy_healer_object(this, healer); + } + + GF_FREE(shd->index_healers); + GF_FREE(shd->full_healers); +} diff --git a/xlators/cluster/ec/src/ec-heald.h b/xlators/cluster/ec/src/ec-heald.h new file mode 100644 index 00000000000..6c7da4edc10 --- /dev/null +++ b/xlators/cluster/ec/src/ec-heald.h @@ -0,0 +1,30 @@ +/* + Copyright (c) 2015 Red Hat, Inc. 
<http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_HEALD_H__ +#define __EC_HEALD_H__ + +#include "ec-types.h" // for ec_t +#include "glusterfs/dict.h" // for dict_t +#include "glusterfs/globals.h" // for xlator_t + +int +ec_xl_op(xlator_t *this, dict_t *input, dict_t *output); + +int +ec_selfheal_daemon_init(xlator_t *this); + +void +ec_shd_index_healer_wake(ec_t *ec); + +void +ec_selfheal_daemon_fini(xlator_t *this); + +#endif /* __EC_HEALD_H__ */ diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c new file mode 100644 index 00000000000..48f54475e01 --- /dev/null +++ b/xlators/cluster/ec/src/ec-helpers.c @@ -0,0 +1,867 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <libgen.h> + +#include <glusterfs/byte-order.h> + +#include "ec.h" +#include "ec-mem-types.h" +#include "ec-messages.h" +#include "ec-fops.h" +#include "ec-method.h" +#include "ec-helpers.h" + +static const char *ec_fop_list[] = {[-EC_FOP_HEAL] = "HEAL"}; + +const char * +ec_bin(char *str, size_t size, uint64_t value, int32_t digits) +{ + str += size; + + if (size-- < 1) { + goto failed; + } + *--str = 0; + + while ((value != 0) || (digits > 0)) { + if (size-- < 1) { + goto failed; + } + *--str = '0' + (value & 1); + digits--; + value >>= 1; + } + + return str; + +failed: + return "<buffer too small>"; +} + +const char * +ec_fop_name(int32_t id) +{ + if (id >= 0) { + return gf_fop_list[id]; + } + + return ec_fop_list[-id]; +} + +void +ec_trace(const char *event, ec_fop_data_t *fop, const char *fmt, ...) +{ + char str1[32], str2[32], str3[32]; + char *msg; + ec_t *ec = fop->xl->private; + va_list args; + int32_t ret; + + va_start(args, fmt); + ret = vasprintf(&msg, fmt, args); + va_end(args); + + if (ret < 0) { + msg = "<memory allocation error>"; + } + + gf_msg_trace("ec", 0, + "%s(%s) %p(%p) [refs=%d, winds=%d, jobs=%d] " + "frame=%p/%p, min/exp=%d/%d, err=%d state=%d " + "{%s:%s:%s} %s", + event, ec_fop_name(fop->id), fop, fop->parent, fop->refs, + fop->winds, fop->jobs, fop->req_frame, fop->frame, + fop->minimum, fop->expected, fop->error, fop->state, + ec_bin(str1, sizeof(str1), fop->mask, ec->nodes), + ec_bin(str2, sizeof(str2), fop->remaining, ec->nodes), + ec_bin(str3, sizeof(str3), fop->good, ec->nodes), msg); + + if (ret >= 0) { + free(msg); + } +} + +int32_t +ec_bits_consume(uint64_t *n) +{ + uint64_t tmp; + + tmp = *n; + tmp &= -tmp; + *n ^= tmp; + + return gf_bits_index(tmp); +} + +size_t +ec_iov_copy_to(void *dst, struct iovec *vector, int32_t count, off_t offset, + size_t size) +{ + int32_t i = 0; + size_t total = 0, len = 0; + + while (i < count) { + if (offset < vector[i].iov_len) { + while ((i < count) && (size > 0)) { + len = size; 
+ if (len > vector[i].iov_len - offset) { + len = vector[i].iov_len - offset; + } + memcpy(dst, vector[i++].iov_base + offset, len); + offset = 0; + dst += len; + total += len; + size -= len; + } + + break; + } + + offset -= vector[i].iov_len; + i++; + } + + return total; +} + +int32_t +ec_buffer_alloc(xlator_t *xl, size_t size, struct iobref **piobref, void **ptr) +{ + struct iobref *iobref = NULL; + struct iobuf *iobuf = NULL; + int32_t ret = -ENOMEM; + + iobuf = iobuf_get_page_aligned(xl->ctx->iobuf_pool, size, + EC_METHOD_WORD_SIZE); + if (iobuf == NULL) { + goto out; + } + + iobref = *piobref; + if (iobref == NULL) { + iobref = iobref_new(); + if (iobref == NULL) { + goto out; + } + } + + ret = iobref_add(iobref, iobuf); + if (ret != 0) { + if (iobref != *piobref) { + iobref_unref(iobref); + } + iobref = NULL; + + goto out; + } + + GF_ASSERT(EC_ALIGN_CHECK(iobuf->ptr, EC_METHOD_WORD_SIZE)); + + *ptr = iobuf->ptr; + +out: + if (iobuf != NULL) { + iobuf_unref(iobuf); + } + + if (iobref != NULL) { + *piobref = iobref; + } + + return ret; +} + +int32_t +ec_dict_set_array(dict_t *dict, char *key, uint64_t value[], int32_t size) +{ + int ret = -1; + uint64_t *ptr = NULL; + int32_t vindex; + + if (value == NULL) { + return -EINVAL; + } + + ptr = GF_MALLOC(sizeof(uint64_t) * size, gf_common_mt_char); + if (ptr == NULL) { + return -ENOMEM; + } + for (vindex = 0; vindex < size; vindex++) { + ptr[vindex] = hton64(value[vindex]); + } + ret = dict_set_bin(dict, key, ptr, sizeof(uint64_t) * size); + if (ret) + GF_FREE(ptr); + return ret; +} + +int32_t +ec_dict_get_array(dict_t *dict, char *key, uint64_t value[], int32_t size) +{ + void *ptr; + int32_t len; + int32_t vindex; + int32_t old_size = 0; + int32_t err; + + if (dict == NULL) { + return -EINVAL; + } + err = dict_get_ptr_and_len(dict, key, &ptr, &len); + if (err != 0) { + return err; + } + + if (len > (size * sizeof(uint64_t)) || (len % sizeof(uint64_t))) { + return -EINVAL; + } + + /* 3.6 version ec would have 
stored version in 64 bit. In that case treat + * metadata versions same as data*/ + old_size = min(size, len / sizeof(uint64_t)); + for (vindex = 0; vindex < old_size; vindex++) { + value[vindex] = ntoh64(*((uint64_t *)ptr + vindex)); + } + + if (old_size < size) { + for (vindex = old_size; vindex < size; vindex++) { + value[vindex] = value[old_size - 1]; + } + } + + return 0; +} + +int32_t +ec_dict_del_array(dict_t *dict, char *key, uint64_t value[], int32_t size) +{ + int ret = 0; + + ret = ec_dict_get_array(dict, key, value, size); + if (ret == 0) + dict_del(dict, key); + + return ret; +} + +int32_t +ec_dict_set_number(dict_t *dict, char *key, uint64_t value) +{ + int ret = -1; + uint64_t *ptr; + + ptr = GF_MALLOC(sizeof(value), gf_common_mt_char); + if (ptr == NULL) { + return -ENOMEM; + } + + *ptr = hton64(value); + + ret = dict_set_bin(dict, key, ptr, sizeof(value)); + if (ret) + GF_FREE(ptr); + + return ret; +} + +int32_t +ec_dict_del_number(dict_t *dict, char *key, uint64_t *value) +{ + void *ptr; + int32_t len, err; + + if (dict == NULL) { + return -EINVAL; + } + err = dict_get_ptr_and_len(dict, key, &ptr, &len); + if (err != 0) { + return err; + } + if (len != sizeof(uint64_t)) { + return -EINVAL; + } + + *value = ntoh64(*(uint64_t *)ptr); + + dict_del(dict, key); + + return 0; +} + +int32_t +ec_dict_set_config(dict_t *dict, char *key, ec_config_t *config) +{ + int ret = -1; + uint64_t *ptr, data; + + if (config->version > EC_CONFIG_VERSION) { + gf_msg("ec", GF_LOG_ERROR, EINVAL, EC_MSG_UNSUPPORTED_VERSION, + "Trying to store an unsupported config " + "version (%u)", + config->version); + + return -EINVAL; + } + + ptr = GF_MALLOC(sizeof(uint64_t), gf_common_mt_char); + if (ptr == NULL) { + return -ENOMEM; + } + + data = ((uint64_t)config->version) << 56; + data |= ((uint64_t)config->algorithm) << 48; + data |= ((uint64_t)config->gf_word_size) << 40; + data |= ((uint64_t)config->bricks) << 32; + data |= ((uint64_t)config->redundancy) << 24; + data |= 
config->chunk_size; + + *ptr = hton64(data); + + ret = dict_set_bin(dict, key, ptr, sizeof(uint64_t)); + if (ret) + GF_FREE(ptr); + + return ret; +} + +int32_t +ec_dict_del_config(dict_t *dict, char *key, ec_config_t *config) +{ + void *ptr; + uint64_t data; + int32_t len, err; + + if (dict == NULL) { + return -EINVAL; + } + err = dict_get_ptr_and_len(dict, key, &ptr, &len); + if (err != 0) { + return err; + } + if (len != sizeof(uint64_t)) { + return -EINVAL; + } + + data = ntoh64(*(uint64_t *)ptr); + /* Currently we need to get the config xattr for entries of type IA_INVAL. + * These entries can later become IA_DIR entries (after inode_link()), + * which don't have a config xattr. However, since the xattr is requested + * using an xattrop() fop, it will always return a config full of 0's + * instead of saying that it doesn't exist. + * + * We need to filter out this case and consider that a config xattr == 0 is + * the same as a non-existent xattr. Otherwise ec_config_check() will fail. 
+ */ + if (data == 0) { + return -ENODATA; + } + + config->version = (data >> 56) & 0xff; + if (config->version > EC_CONFIG_VERSION) { + gf_msg("ec", GF_LOG_ERROR, EINVAL, EC_MSG_UNSUPPORTED_VERSION, + "Found an unsupported config version (%u)", config->version); + + return -EINVAL; + } + + config->algorithm = (data >> 48) & 0xff; + config->gf_word_size = (data >> 40) & 0xff; + config->bricks = (data >> 32) & 0xff; + config->redundancy = (data >> 24) & 0xff; + config->chunk_size = data & 0xffffff; + + dict_del(dict, key); + + return 0; +} + +gf_boolean_t +ec_loc_gfid_check(xlator_t *xl, uuid_t dst, uuid_t src) +{ + if (gf_uuid_is_null(src)) { + return _gf_true; + } + + if (gf_uuid_is_null(dst)) { + gf_uuid_copy(dst, src); + + return _gf_true; + } + + if (gf_uuid_compare(dst, src) != 0) { + gf_msg(xl->name, GF_LOG_WARNING, 0, EC_MSG_GFID_MISMATCH, + "Mismatching GFID's in loc"); + + return _gf_false; + } + + return _gf_true; +} + +int32_t +ec_loc_setup_inode(xlator_t *xl, inode_table_t *table, loc_t *loc) +{ + int32_t ret = -EINVAL; + + if (loc->inode != NULL) { + if (!ec_loc_gfid_check(xl, loc->gfid, loc->inode->gfid)) { + goto out; + } + } else if (table != NULL) { + if (!gf_uuid_is_null(loc->gfid)) { + loc->inode = inode_find(table, loc->gfid); + } else if (loc->path && strchr(loc->path, '/')) { + loc->inode = inode_resolve(table, (char *)loc->path); + } + } + + ret = 0; + +out: + return ret; +} + +int32_t +ec_loc_setup_parent(xlator_t *xl, inode_table_t *table, loc_t *loc) +{ + char *path, *parent; + int32_t ret = -EINVAL; + + if (loc->parent != NULL) { + if (!ec_loc_gfid_check(xl, loc->pargfid, loc->parent->gfid)) { + goto out; + } + } else if (table != NULL) { + if (!gf_uuid_is_null(loc->pargfid)) { + loc->parent = inode_find(table, loc->pargfid); + } else if (loc->path && strchr(loc->path, '/')) { + path = gf_strdup(loc->path); + if (path == NULL) { + gf_msg(xl->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Unable to duplicate path '%s'", loc->path); + + 
ret = -ENOMEM; + + goto out; + } + parent = dirname(path); + loc->parent = inode_resolve(table, parent); + if (loc->parent != NULL) { + gf_uuid_copy(loc->pargfid, loc->parent->gfid); + } + GF_FREE(path); + } + } + + /* If 'pargfid' has not been determined, clear 'name' to avoid resolutions + based on <gfid:pargfid>/name. */ + if (gf_uuid_is_null(loc->pargfid)) { + loc->name = NULL; + } + + ret = 0; + +out: + return ret; +} + +int32_t +ec_loc_setup_path(xlator_t *xl, loc_t *loc) +{ + static uuid_t root = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + char *name; + int32_t ret = -EINVAL; + + if (loc->path != NULL) { + name = strrchr(loc->path, '/'); + if (name == NULL) { + /* Allow gfid paths: <gfid:...> */ + if (strncmp(loc->path, "<gfid:", 6) == 0) { + ret = 0; + } + goto out; + } + if (name == loc->path) { + if (name[1] == 0) { + if (!ec_loc_gfid_check(xl, loc->gfid, root)) { + goto out; + } + } else { + if (!ec_loc_gfid_check(xl, loc->pargfid, root)) { + goto out; + } + } + } + name++; + + if (loc->name != NULL) { + if (strcmp(loc->name, name) != 0) { + gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_LOC_NAME, + "Invalid name '%s' in loc", loc->name); + + goto out; + } + } else { + loc->name = name; + } + } + + ret = 0; + +out: + return ret; +} + +int32_t +ec_loc_parent(xlator_t *xl, loc_t *loc, loc_t *parent) +{ + inode_table_t *table = NULL; + char *str = NULL; + int32_t ret = -ENOMEM; + + memset(parent, 0, sizeof(loc_t)); + + if (loc->parent != NULL) { + table = loc->parent->table; + parent->inode = inode_ref(loc->parent); + } else if (loc->inode != NULL) { + table = loc->inode->table; + } + if (!gf_uuid_is_null(loc->pargfid)) { + gf_uuid_copy(parent->gfid, loc->pargfid); + } + if (loc->path && strchr(loc->path, '/')) { + str = gf_strdup(loc->path); + if (str == NULL) { + gf_msg(xl->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Unable to duplicate path '%s'", loc->path); + + goto out; + } + parent->path = gf_strdup(dirname(str)); + if 
(parent->path == NULL) { + gf_msg(xl->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Unable to duplicate path '%s'", dirname(str)); + + goto out; + } + } + + ret = ec_loc_setup_path(xl, parent); + if (ret == 0) { + ret = ec_loc_setup_inode(xl, table, parent); + } + if (ret == 0) { + ret = ec_loc_setup_parent(xl, table, parent); + } + if (ret != 0) { + goto out; + } + + if ((parent->inode == NULL) && (parent->path == NULL) && + gf_uuid_is_null(parent->gfid)) { + gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_LOC_PARENT_INODE_MISSING, + "Parent inode missing for loc_t"); + + ret = -EINVAL; + + goto out; + } + + ret = 0; + +out: + GF_FREE(str); + + if (ret != 0) { + loc_wipe(parent); + } + + return ret; +} + +int32_t +ec_loc_update(xlator_t *xl, loc_t *loc, inode_t *inode, struct iatt *iatt) +{ + inode_table_t *table = NULL; + int32_t ret = -EINVAL; + + if (inode != NULL) { + table = inode->table; + if (loc->inode != inode) { + if (loc->inode != NULL) { + inode_unref(loc->inode); + } + loc->inode = inode_ref(inode); + gf_uuid_copy(loc->gfid, inode->gfid); + } + } else if (loc->inode != NULL) { + table = loc->inode->table; + } else if (loc->parent != NULL) { + table = loc->parent->table; + } + + if (iatt != NULL) { + if (!ec_loc_gfid_check(xl, loc->gfid, iatt->ia_gfid)) { + goto out; + } + } + + ret = ec_loc_setup_path(xl, loc); + if (ret == 0) { + ret = ec_loc_setup_inode(xl, table, loc); + } + if (ret == 0) { + ret = ec_loc_setup_parent(xl, table, loc); + } + if (ret != 0) { + goto out; + } + +out: + return ret; +} + +int32_t +ec_loc_from_fd(xlator_t *xl, loc_t *loc, fd_t *fd) +{ + ec_fd_t *ctx; + int32_t ret = -ENOMEM; + + memset(loc, 0, sizeof(*loc)); + + ctx = ec_fd_get(fd, xl); + if (ctx != NULL) { + if (loc_copy(loc, &ctx->loc) != 0) { + goto out; + } + } + + ret = ec_loc_update(xl, loc, fd->inode, NULL); + if (ret != 0) { + goto out; + } + +out: + if (ret != 0) { + loc_wipe(loc); + } + + return ret; +} + +int32_t +ec_loc_from_loc(xlator_t *xl, loc_t *dst, 
loc_t *src) +{ + int32_t ret = -ENOMEM; + + memset(dst, 0, sizeof(*dst)); + + if (loc_copy(dst, src) != 0) { + goto out; + } + + ret = ec_loc_update(xl, dst, NULL, NULL); + if (ret != 0) { + goto out; + } + +out: + if (ret != 0) { + loc_wipe(dst); + } + + return ret; +} + +void +ec_owner_set(call_frame_t *frame, void *owner) +{ + set_lk_owner_from_ptr(&frame->root->lk_owner, owner); +} + +void +ec_owner_copy(call_frame_t *frame, gf_lkowner_t *owner) +{ + lk_owner_copy(&frame->root->lk_owner, owner); +} + +static void +ec_stripe_cache_init(ec_t *ec, ec_inode_t *ctx) +{ + ec_stripe_list_t *stripe_cache = NULL; + + stripe_cache = &(ctx->stripe_cache); + if (stripe_cache->max == 0) { + stripe_cache->max = ec->stripe_cache; + } +} + +ec_inode_t * +__ec_inode_get(inode_t *inode, xlator_t *xl) +{ + ec_inode_t *ctx = NULL; + uint64_t value = 0; + + if ((__inode_ctx_get(inode, xl, &value) != 0) || (value == 0)) { + ctx = GF_MALLOC(sizeof(*ctx), ec_mt_ec_inode_t); + if (ctx != NULL) { + memset(ctx, 0, sizeof(*ctx)); + INIT_LIST_HEAD(&ctx->heal); + INIT_LIST_HEAD(&ctx->stripe_cache.lru); + ctx->heal_count = 0; + value = (uint64_t)(uintptr_t)ctx; + if (__inode_ctx_set(inode, xl, &value) != 0) { + GF_FREE(ctx); + + return NULL; + } + } + } else { + ctx = (ec_inode_t *)(uintptr_t)value; + } + if (ctx) + ec_stripe_cache_init(xl->private, ctx); + + return ctx; +} + +ec_inode_t * +ec_inode_get(inode_t *inode, xlator_t *xl) +{ + ec_inode_t *ctx = NULL; + + LOCK(&inode->lock); + + ctx = __ec_inode_get(inode, xl); + + UNLOCK(&inode->lock); + + return ctx; +} + +ec_fd_t * +__ec_fd_get(fd_t *fd, xlator_t *xl) +{ + int i = 0; + ec_fd_t *ctx = NULL; + ec_inode_t *ictx = NULL; + uint64_t value = 0; + ec_t *ec = xl->private; + + if ((__fd_ctx_get(fd, xl, &value) != 0) || (value == 0)) { + ctx = GF_MALLOC(sizeof(*ctx) + (sizeof(ec_fd_status_t) * ec->nodes), + ec_mt_ec_fd_t); + if (ctx != NULL) { + memset(ctx, 0, sizeof(*ctx)); + + for (i = 0; i < ec->nodes; i++) { + if (fd_is_anonymous(fd)) 
{ + ctx->fd_status[i] = EC_FD_OPENED; + } else { + ctx->fd_status[i] = EC_FD_NOT_OPENED; + } + } + + value = (uint64_t)(uintptr_t)ctx; + if (__fd_ctx_set(fd, xl, value) != 0) { + GF_FREE(ctx); + return NULL; + } + /* Only refering bad-version so no need for lock + * */ + ictx = __ec_inode_get(fd->inode, xl); + if (ictx) { + ctx->bad_version = ictx->bad_version; + } + } + } else { + ctx = (ec_fd_t *)(uintptr_t)value; + } + + /* Treat anonymous fd specially */ + if (fd->anonymous && ctx) { + /* Mark the fd open for all subvolumes. */ + ctx->open = -1; + /* Try to populate ctx->loc with fd->inode information. */ + ec_loc_update(xl, &ctx->loc, fd->inode, NULL); + } + + return ctx; +} + +ec_fd_t * +ec_fd_get(fd_t *fd, xlator_t *xl) +{ + ec_fd_t *ctx = NULL; + + LOCK(&fd->lock); + + ctx = __ec_fd_get(fd, xl); + + UNLOCK(&fd->lock); + + return ctx; +} + +gf_boolean_t +ec_is_internal_xattr(dict_t *dict, char *key, data_t *value, void *data) +{ + if (key && (strncmp(key, EC_XATTR_PREFIX, SLEN(EC_XATTR_PREFIX)) == 0)) + return _gf_true; + + return _gf_false; +} + +void +ec_filter_internal_xattrs(dict_t *xattr) +{ + dict_foreach_match(xattr, ec_is_internal_xattr, NULL, + dict_remove_foreach_fn, NULL); +} + +gf_boolean_t +ec_is_data_fop(glusterfs_fop_t fop) +{ + switch (fop) { + case GF_FOP_WRITE: + case GF_FOP_TRUNCATE: + case GF_FOP_FTRUNCATE: + case GF_FOP_FALLOCATE: + case GF_FOP_DISCARD: + case GF_FOP_ZEROFILL: + return _gf_true; + default: + return _gf_false; + } + return _gf_false; +} +/* +gf_boolean_t +ec_is_metadata_fop (int32_t lock_kind, glusterfs_fop_t fop) +{ + if (lock_kind == EC_LOCK_ENTRY) { + return _gf_false; + } + + switch (fop) { + case GF_FOP_SETATTR: + case GF_FOP_FSETATTR: + case GF_FOP_SETXATTR: + case GF_FOP_FSETXATTR: + case GF_FOP_REMOVEXATTR: + case GF_FOP_FREMOVEXATTR: + return _gf_true; + default: + return _gf_false; + } + return _gf_false; +}*/ diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h new file mode 
100644 index 00000000000..015db514e05 --- /dev/null +++ b/xlators/cluster/ec/src/ec-helpers.h @@ -0,0 +1,200 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_HELPERS_H__ +#define __EC_HELPERS_H__ + +#include "ec-types.h" + +#define EC_ERR(_x) ((void *)-(intptr_t)(_x)) +#define EC_IS_ERR(_x) (((uintptr_t)(_x) & ~0xfffULL) == ~0xfffULL) +#define EC_GET_ERR(_x) ((int32_t)(intptr_t)(_x)) + +#define EC_ALIGN_CHECK(_ptr, _align) ((((uintptr_t)(_ptr)) & ((_align)-1)) == 0) + +const char * +ec_bin(char *str, size_t size, uint64_t value, int32_t digits); +const char * +ec_fop_name(int32_t id); +void +ec_trace(const char *event, ec_fop_data_t *fop, const char *fmt, ...); +int32_t +ec_bits_consume(uint64_t *n); +size_t +ec_iov_copy_to(void *dst, struct iovec *vector, int32_t count, off_t offset, + size_t size); +int32_t +ec_buffer_alloc(xlator_t *xl, size_t size, struct iobref **piobref, void **ptr); +int32_t +ec_dict_set_array(dict_t *dict, char *key, uint64_t *value, int32_t size); +int32_t +ec_dict_get_array(dict_t *dict, char *key, uint64_t value[], int32_t size); + +int32_t +ec_dict_del_array(dict_t *dict, char *key, uint64_t *value, int32_t size); +int32_t +ec_dict_set_number(dict_t *dict, char *key, uint64_t value); +int32_t +ec_dict_del_number(dict_t *dict, char *key, uint64_t *value); +int32_t +ec_dict_set_config(dict_t *dict, char *key, ec_config_t *config); +int32_t +ec_dict_del_config(dict_t *dict, char *key, ec_config_t *config); + +int32_t +ec_loc_parent(xlator_t *xl, loc_t *loc, loc_t *parent); +int32_t +ec_loc_update(xlator_t *xl, loc_t *loc, inode_t *inode, struct iatt *iatt); + +int32_t +ec_loc_from_fd(xlator_t *xl, loc_t 
*loc, fd_t *fd); +int32_t +ec_loc_from_loc(xlator_t *xl, loc_t *dst, loc_t *src); + +void +ec_owner_set(call_frame_t *frame, void *owner); +void +ec_owner_copy(call_frame_t *frame, gf_lkowner_t *owner); + +ec_inode_t * +__ec_inode_get(inode_t *inode, xlator_t *xl); +ec_inode_t * +ec_inode_get(inode_t *inode, xlator_t *xl); +ec_fd_t * +__ec_fd_get(fd_t *fd, xlator_t *xl); +ec_fd_t * +ec_fd_get(fd_t *fd, xlator_t *xl); + +static inline uint32_t +ec_adjust_size_down(ec_t *ec, uint64_t *value, gf_boolean_t scale) +{ + uint64_t head, tmp; + + tmp = *value; + head = tmp % ec->stripe_size; + tmp -= head; + + if (scale) { + tmp /= ec->fragments; + } + + *value = tmp; + + return (uint32_t)head; +} + +/* This function can cause an overflow if the passed value is too near to the + * uint64_t limit. If this happens, it returns the tail in negative form and + * the value is set to UINT64_MAX. */ +static inline int32_t +ec_adjust_size_up(ec_t *ec, uint64_t *value, gf_boolean_t scale) +{ + uint64_t tmp; + int32_t tail; + + tmp = *value; + /* We first adjust the value down. This never causes overflow. */ + tail = ec_adjust_size_down(ec, &tmp, scale); + + /* If the value was already aligned, tail will be 0 and nothing else + * needs to be done. */ + if (tail != 0) { + /* Otherwise, we need to compute the real tail and adjust the + * returned value to the next stripe. */ + tail = ec->stripe_size - tail; + if (scale) { + tmp += ec->fragment_size; + } else { + tmp += ec->stripe_size; + /* If no scaling is requested there's a possibility of + * overflow. */ + if (tmp < ec->stripe_size) { + tmp = UINT64_MAX; + tail = -tail; + } + } + } + + *value = tmp; + + return tail; +} + +/* This function is equivalent to ec_adjust_size_down() but with a potentially + * different parameter size (off_t vs uint64_t). 
*/ +static inline uint32_t +ec_adjust_offset_down(ec_t *ec, off_t *value, gf_boolean_t scale) +{ + off_t head, tmp; + + tmp = *value; + head = tmp % ec->stripe_size; + tmp -= head; + + if (scale) { + tmp /= ec->fragments; + } + + *value = tmp; + + return (uint32_t)head; +} + +/* This function is equivalent to ec_adjust_size_up() but with a potentially + * different parameter size (off_t vs uint64_t). */ +static inline int32_t +ec_adjust_offset_up(ec_t *ec, off_t *value, gf_boolean_t scale) +{ + uint64_t tail, tmp; + + /* An offset is a signed type that can only have positive values, so + * we take advantage of this to avoid overflows. We simply convert it + * to an unsigned integer and operate normally. This won't cause an + * overflow. Overflow is only checked when converting back to an + * off_t. */ + tmp = *value; + tail = ec->stripe_size; + tail -= (tmp + tail - 1) % tail + 1; + tmp += tail; + if (scale) { + /* If we are scaling, we'll never get an overflow. */ + tmp /= ec->fragments; + } else { + /* Check if there has been an overflow. */ + if ((off_t)tmp < 0) { + tmp = GF_OFF_MAX; + tail = -tail; + } + } + + *value = (off_t)tmp; + + return (int32_t)tail; +} + +static inline int32_t +ec_is_power_of_2(uint32_t value) +{ + return (value != 0) && ((value & (value - 1)) == 0); +} + +gf_boolean_t +ec_is_internal_xattr(dict_t *dict, char *key, data_t *value, void *data); + +void +ec_filter_internal_xattrs(dict_t *xattr); + +gf_boolean_t +ec_is_data_fop(glusterfs_fop_t fop); + +int32_t +ec_launch_replace_heal(ec_t *ec); + +#endif /* __EC_HELPERS_H__ */ diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c new file mode 100644 index 00000000000..dad5f4d7018 --- /dev/null +++ b/xlators/cluster/ec/src/ec-inode-read.c @@ -0,0 +1,2046 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "ec.h" +#include "ec-messages.h" +#include "ec-helpers.h" +#include "ec-common.h" +#include "ec-combine.h" +#include "ec-method.h" +#include "ec-fops.h" + +/* FOP: access */ + +int32_t +ec_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_ACCESS, idx, op_ret, + op_errno); + if (cbk) { + if (xdata) + cbk->xdata = dict_ref(xdata); + ec_combine(cbk, NULL); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_access(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_access_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->access, + &fop->loc[0], fop->int32, fop->xdata); +} + +int32_t +ec_manager_access(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk = NULL; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_LOCK: + ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0, + EC_RANGE_FULL); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_one(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + if (ec_dispatch_one_retry(fop, NULL)) { + return 
EC_STATE_DISPATCH; + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + GF_ASSERT(cbk); + if (fop->cbks.access != NULL) { + if (cbk) { + fop->cbks.access(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, cbk->xdata); + } + } + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + if (fop->cbks.access != NULL) { + fop->cbks.access(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL); + } + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_access(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_access_cbk_t func, void *data, loc_t *loc, + int32_t mask, dict_t *xdata) +{ + ec_cbk_t callback = {.access = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(ACCESS) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_ACCESS, EC_FLAG_LOCK_SHARED, + target, fop_flags, ec_wind_access, + ec_manager_access, callback, data); + if (fop == NULL) { + goto out; + } + + fop->int32 = mask; + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + 
goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL); + } +} + +/* FOP: getxattr */ + +int32_t +ec_combine_getxattr(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src) +{ + if (!ec_dict_compare(dst->dict, src->dict)) { + gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_DICT_MISMATCH, + "Mismatching dictionary in " + "answers of 'GF_FOP_GETXATTR'"); + + return 0; + } + + return 1; +} + +int32_t +ec_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_GETXATTR, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (op_ret >= 0) { + if (dict != NULL) { + cbk->dict = dict_ref(dict); + if (cbk->dict == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + } + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + ec_combine(cbk, ec_combine_getxattr); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_getxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_getxattr_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->getxattr, + &fop->loc[0], fop->str[0], fop->xdata); 
+} + +void +ec_handle_special_xattrs(ec_fop_data_t *fop) +{ + ec_cbk_data_t *cbk = NULL; + /* Stime may not be available on all the bricks, so even if some of the + * subvols succeed the operation, treat it as answer.*/ + if (fop->str[0] && fnmatch(GF_XATTR_STIME_PATTERN, fop->str[0], 0) == 0) { + if (!fop->answer || (fop->answer->op_ret < 0)) { + list_for_each_entry(cbk, &fop->cbk_list, list) + { + if (cbk->op_ret >= 0) { + fop->answer = cbk; + break; + } + } + } + } +} + +int32_t +ec_manager_getxattr(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_LOCK: + /* clear-locks commands must be done without any locks acquired + to avoid interferences. */ + if ((fop->str[0] == NULL) || + (strncmp(fop->str[0], GF_XATTR_CLRLK_CMD, + SLEN(GF_XATTR_CLRLK_CMD)) != 0)) { + if (fop->fd == NULL) { + ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0, + EC_RANGE_FULL); + } else { + ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0, + EC_RANGE_FULL); + } + ec_lock(fop); + } + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + if (fop->minimum == EC_MINIMUM_ALL) { + ec_dispatch_all(fop); + } else { + ec_dispatch_one(fop); + } + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + ec_handle_special_xattrs(fop); + if (fop->minimum == EC_MINIMUM_ALL) { + cbk = ec_fop_prepare_answer(fop, _gf_true); + } else { + if (ec_dispatch_one_retry(fop, &cbk)) { + return EC_STATE_DISPATCH; + } + } + if (cbk != NULL) { + int32_t err; + + err = ec_dict_combine(cbk, EC_COMBINE_DICT); + if (!ec_cbk_set_error(cbk, -err, _gf_true)) { + if (cbk->xdata != NULL) + ec_filter_internal_xattrs(cbk->xdata); + + if (cbk->dict != NULL) + ec_filter_internal_xattrs(cbk->dict); + } + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.getxattr != NULL) { + fop->cbks.getxattr(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, cbk->dict, 
cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.getxattr != NULL) { + fop->cbks.getxattr(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +int32_t +ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl, + int32_t op_ret, int32_t op_errno, uintptr_t mask, + uintptr_t good, uintptr_t bad, uint32_t pending, + dict_t *xdata) +{ + fop_getxattr_cbk_t func = cookie; + ec_t *ec = xl->private; + dict_t *dict = NULL; + char *str; + char bin1[65], bin2[65]; + + /* We try to return the 'pending' information in xdata, but if this cannot + * be set, we will ignore it silently. We prefer to report the success or + * failure of the heal itself. */ + if (xdata == NULL) { + xdata = dict_new(); + } else { + dict_ref(xdata); + } + if (xdata != NULL) { + if (dict_set_uint32(xdata, EC_XATTR_HEAL_NEW, pending) != 0) { + /* dict_set_uint32() is marked as 'warn_unused_result' and gcc + * enforces to check the result in this case. However we don't + * really care if it succeeded or not. We'll just do the same. + * + * This empty 'if' avoids the warning, and it will be removed by + * the optimizer. 
*/ + } + } + + if (op_ret >= 0) { + dict = dict_new(); + if (dict == NULL) { + op_ret = -1; + op_errno = ENOMEM; + } else { + if (gf_asprintf(&str, "Good: %s, Bad: %s", + ec_bin(bin1, sizeof(bin1), good, ec->nodes), + ec_bin(bin2, sizeof(bin2), mask & ~(good | bad), + ec->nodes)) < 0) { + dict_unref(dict); + dict = NULL; + + op_ret = -1; + op_errno = ENOMEM; + + goto out; + } + + if (dict_set_dynstr(dict, EC_XATTR_HEAL, str) != 0) { + GF_FREE(str); + dict_unref(dict); + dict = NULL; + + op_ret = -1; + op_errno = ENOMEM; + + goto out; + } + } + } + +out: + func(frame, NULL, xl, op_ret, op_errno, dict, xdata); + + if (dict != NULL) { + dict_unref(dict); + } + if (xdata != NULL) { + dict_unref(xdata); + } + + return 0; +} + +void +ec_getxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_getxattr_cbk_t func, void *data, loc_t *loc, + const char *name, dict_t *xdata) +{ + ec_cbk_t callback = {.getxattr = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(GETXATTR) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + /* Special handling of an explicit self-heal request */ + if ((name != NULL) && (strcmp(name, EC_XATTR_HEAL) == 0)) { + ec_heal(frame, this, target, EC_MINIMUM_ONE, ec_getxattr_heal_cbk, func, + loc, 0, NULL); + + return; + } + + fop = ec_fop_data_allocate( + frame, this, GF_FOP_GETXATTR, EC_FLAG_LOCK_SHARED, target, fop_flags, + ec_wind_getxattr, ec_manager_getxattr, callback, data); + if (fop == NULL) { + goto out; + } + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (name != NULL) { + /* In case of list-node-uuids xattr, set flag to indicate + * the same and use node-uuid xattr for winding fop */ + if (XATTR_IS_NODE_UUID_LIST(name)) { + 
fop->int32 = 1; + fop->str[0] = gf_strdup(GF_XATTR_NODE_UUID_KEY); + } else { + fop->str[0] = gf_strdup(name); + } + if (fop->str[0] == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to duplicate a string."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL); + } +} + +/* FOP: fgetxattr */ + +int32_t +ec_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FGETXATTR, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (op_ret >= 0) { + if (dict != NULL) { + cbk->dict = dict_ref(dict); + if (cbk->dict == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + } + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + ec_combine(cbk, ec_combine_getxattr); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_fgetxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + 
STACK_WIND_COOKIE(fop->frame, ec_fgetxattr_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->fgetxattr, + fop->fd, fop->str[0], fop->xdata); +} + +void +ec_fgetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fgetxattr_cbk_t func, void *data, fd_t *fd, + const char *name, dict_t *xdata) +{ + ec_cbk_t callback = {.fgetxattr = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(FGETXATTR) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate( + frame, this, GF_FOP_FGETXATTR, EC_FLAG_LOCK_SHARED, target, fop_flags, + ec_wind_fgetxattr, ec_manager_getxattr, callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (name != NULL) { + fop->str[0] = gf_strdup(name); + if (fop->str[0] == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to duplicate a string."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL); + } +} + +/* FOP: open */ + +int32_t +ec_combine_open(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src) +{ + if (dst->fd != src->fd) { + gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_FD_MISMATCH, + "Mismatching fd in answers " + "of 'GF_FOP_OPEN': %p <-> %p", + dst->fd, src->fd); + + return 0; + } + + return 1; +} + +int32_t +ec_open_cbk(call_frame_t *frame, void *cookie, 
xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_OPEN, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (op_ret >= 0) { + if (fd != NULL) { + cbk->fd = fd_ref(fd); + if (cbk->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, + EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + } + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + ec_combine(cbk, ec_combine_open); + + ec_update_fd_status(fd, this, idx, op_ret); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_open(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_open_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->open, + &fop->loc[0], fop->int32, fop->fd, fop->xdata); +} + +int32_t +ec_open_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + ec_fop_data_t *fop = cookie; + int32_t error = 0; + + fop = fop->data; + if (op_ret >= 0) { + fop->answer->iatt[0] = *postbuf; + } else { + error = op_errno; + } + + ec_resume(fop, error); + + return 0; +} + +int32_t +ec_manager_open(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + ec_fd_t *ctx; + int32_t 
err; + + switch (state) { + case EC_STATE_INIT: + LOCK(&fop->fd->lock); + + ctx = __ec_fd_get(fop->fd, fop->xl); + if (ctx == NULL) { + UNLOCK(&fop->fd->lock); + + fop->error = ENOMEM; + + return EC_STATE_REPORT; + } + if (!ctx->loc.inode) { + err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]); + if (err != 0) { + UNLOCK(&fop->fd->lock); + + fop->error = -err; + + return EC_STATE_REPORT; + } + } + + ctx->flags = fop->int32; + + UNLOCK(&fop->fd->lock); + + /* We need to write to specific offsets on the bricks, so we + need to remove O_APPEND from flags (if present). + If O_TRUNC is specified, we remove it from open and an + ftruncate will be executed later, which will correctly update + the file size taking appropriate locks. O_TRUNC flag is saved + into fop->uint32 to use it later.*/ + fop->uint32 = fop->int32 & O_TRUNC; + fop->int32 &= ~(O_APPEND | O_TRUNC); + + /* Fall through */ + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_true); + if (cbk != NULL) { + int32_t err; + + err = ec_loc_update(fop->xl, &fop->loc[0], cbk->fd->inode, + NULL); + if (!ec_cbk_set_error(cbk, -err, _gf_true)) { + LOCK(&fop->fd->lock); + + ctx = __ec_fd_get(fop->fd, fop->xl); + if (ctx != NULL) { + ctx->open |= cbk->mask; + } + + UNLOCK(&fop->fd->lock); + + /* If O_TRUNC was specified, call ftruncate to + effectively trunc the file with appropriate locks + acquired. We don't use ctx->flags because self-heal + can use the same fd with different flags. 
*/ + if (fop->uint32 != 0) { + ec_sleep(fop); + ec_ftruncate(fop->req_frame, fop->xl, cbk->mask, + fop->minimum, ec_open_truncate_cbk, fop, + cbk->fd, 0, NULL); + } + } + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.open != NULL) { + fop->cbks.open(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, cbk->fd, cbk->xdata); + } + + return EC_STATE_END; + + case -EC_STATE_INIT: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.open != NULL) { + fop->cbks.open(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL); + } + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_open(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_open_cbk_t func, void *data, loc_t *loc, + int32_t flags, fd_t *fd, dict_t *xdata) +{ + ec_cbk_t callback = {.open = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(OPEN) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_OPEN, EC_FLAG_LOCK_SHARED, + target, fop_flags, ec_wind_open, ec_manager_open, + callback, data); + if (fop == NULL) { + goto out; + } + + fop->int32 = flags; + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = 
dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL); + } +} + +/* FOP: readlink */ + +int32_t +ec_combine_readlink(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src) +{ + if (!ec_iatt_combine(fop, dst->iatt, src->iatt, 1)) { + gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_IATT_MISMATCH, + "Mismatching iatt in " + "answers of 'GF_FOP_READLINK'"); + + return 0; + } + + return 1; +} + +int32_t +ec_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, const char *path, + struct iatt *buf, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, fop->id, idx, op_ret, + op_errno); + if (cbk) { + if (xdata) + cbk->xdata = dict_ref(xdata); + + if (cbk->op_ret >= 0) { + cbk->iatt[0] = *buf; + cbk->str = gf_strdup(path); + if (!cbk->str) { + ec_cbk_set_error(cbk, ENOMEM, _gf_true); + } + } + ec_combine(cbk, NULL); + } + +out: + if (fop != NULL) + ec_complete(fop); + + return 0; +} + +void +ec_wind_readlink(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_readlink_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->readlink, + &fop->loc[0], fop->size, fop->xdata); +} + +int32_t +ec_manager_readlink(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk = NULL; 
+ + switch (state) { + case EC_STATE_INIT: + case EC_STATE_LOCK: + ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0, + EC_RANGE_FULL); + ec_lock(fop); + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_one(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + if (ec_dispatch_one_retry(fop, &cbk)) { + return EC_STATE_DISPATCH; + } + + if ((cbk != NULL) && (cbk->op_ret >= 0)) { + ec_iatt_rebuild(fop->xl->private, &cbk->iatt[0], 1, 1); + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + GF_ASSERT(cbk); + if (fop->cbks.readlink != NULL) { + fop->cbks.readlink(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, cbk->str, &cbk->iatt[0], + cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + if (fop->cbks.readlink != NULL) { + fop->cbks.readlink(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL, NULL); + } + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_readlink(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_readlink_cbk_t func, void *data, loc_t *loc, + size_t size, dict_t *xdata) +{ + ec_cbk_t callback = {.readlink = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(READLINK) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate( + frame, this, GF_FOP_READLINK, EC_FLAG_LOCK_SHARED, 
target, fop_flags, + ec_wind_readlink, ec_manager_readlink, callback, data); + if (fop == NULL) { + goto out; + } + + fop->size = size; + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL); + } +} + +/* FOP: readv */ + +int32_t +ec_readv_rebuild(ec_t *ec, ec_fop_data_t *fop, ec_cbk_data_t *cbk) +{ + struct iovec vector[1]; + ec_cbk_data_t *ans = NULL; + struct iobref *iobref = NULL; + void *ptr; + uint64_t fsize = 0, size = 0, max = 0; + int32_t pos, err = -ENOMEM; + + if (cbk->op_ret < 0) { + err = -cbk->op_errno; + + goto out; + } + + /* This shouldn't fail because we have the inode locked. 
*/ + GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &cbk->iatt[0].ia_size)); + + if (cbk->op_ret > 0) { + void *blocks[cbk->count]; + uint32_t values[cbk->count]; + + fsize = cbk->op_ret; + size = fsize * ec->fragments; + for (ans = cbk; ans != NULL; ans = ans->next) { + pos = gf_bits_count(cbk->mask & ((1 << ans->idx) - 1)); + values[pos] = ans->idx + 1; + blocks[pos] = ans->vector[0].iov_base; + if ((ans->int32 != 1) || + !EC_ALIGN_CHECK(blocks[pos], EC_METHOD_WORD_SIZE)) { + if (iobref == NULL) { + err = ec_buffer_alloc(ec->xl, size, &iobref, &ptr); + if (err != 0) { + goto out; + } + } + ec_iov_copy_to(ptr, ans->vector, ans->int32, 0, fsize); + blocks[pos] = ptr; + ptr += fsize; + } + } + + err = ec_buffer_alloc(ec->xl, size, &iobref, &ptr); + if (err != 0) { + goto out; + } + + err = ec_method_decode(&ec->matrix, fsize, cbk->mask, values, blocks, + ptr); + if (err != 0) { + goto out; + } + + vector[0].iov_base = ptr + fop->head; + vector[0].iov_len = size - fop->head; + + max = fop->offset * ec->fragments + size; + if (max > cbk->iatt[0].ia_size) { + max = cbk->iatt[0].ia_size; + } + max -= fop->offset * ec->fragments + fop->head; + if (max > fop->user_size) { + max = fop->user_size; + } + size -= fop->head; + if (size > max) { + vector[0].iov_len -= size - max; + size = max; + } + + cbk->op_ret = size; + cbk->int32 = 1; + + iobref_unref(cbk->buffers); + cbk->buffers = iobref; + + GF_FREE(cbk->vector); + cbk->vector = iov_dup(vector, 1); + if (cbk->vector == NULL) { + return -ENOMEM; + } + } + + return 0; + +out: + if (iobref != NULL) { + iobref_unref(iobref); + } + + return err; +} + +int32_t +ec_combine_readv(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src) +{ + if (!ec_vector_compare(dst->vector, dst->int32, src->vector, src->int32)) { + gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_VECTOR_MISMATCH, + "Mismatching vector in " + "answers of 'GF_FOP_READ'"); + + return 0; + } + + if (!ec_iatt_combine(fop, dst->iatt, src->iatt, 1)) { + 
gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_IATT_MISMATCH, + "Mismatching iatt in " + "answers of 'GF_FOP_READ'"); + + return 0; + } + + return 1; +} + +int32_t +ec_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + ec_t *ec = this->private; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_READ, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (op_ret >= 0) { + cbk->int32 = count; + + if (count > 0) { + cbk->vector = iov_dup(vector, count); + if (cbk->vector == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to duplicate a " + "vector list."); + + goto out; + } + cbk->int32 = count; + } + if (stbuf != NULL) { + cbk->iatt[0] = *stbuf; + } + if (iobref != NULL) { + cbk->buffers = iobref_ref(iobref); + if (cbk->buffers == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_BUF_REF_FAIL, + "Failed to reference a " + "buffer."); + + goto out; + } + } + } + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + if ((op_ret > 0) && ((op_ret % ec->fragment_size) != 0)) { + ec_cbk_set_error(cbk, EIO, _gf_true); + } + + ec_combine(cbk, ec_combine_readv); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_readv(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, 
"idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_readv_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->readv, fop->fd, + fop->size, fop->offset, fop->uint32, fop->xdata); +} + +int32_t +ec_manager_readv(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + ec_t *ec = fop->xl->private; + + switch (state) { + case EC_STATE_INIT: + fop->user_size = fop->size; + fop->head = ec_adjust_offset_down(fop->xl->private, &fop->offset, + _gf_true); + fop->size += fop->head; + ec_adjust_size_up(fop->xl->private, &fop->size, _gf_true); + + /* Fall through */ + + case EC_STATE_LOCK: + ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, fop->offset, + fop->size); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + if (ec->read_mask) { + fop->mask &= ec->read_mask; + } + ec_dispatch_min(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_true); + if (cbk != NULL) { + int32_t err; + + ec_iatt_rebuild(fop->xl->private, cbk->iatt, 1, cbk->count); + + err = ec_readv_rebuild(fop->xl->private, fop, cbk); + if (err != 0) { + ec_cbk_set_error(cbk, -err, _gf_true); + } + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.readv != NULL) { + fop->cbks.readv(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, cbk->vector, cbk->int32, + &cbk->iatt[0], cbk->buffers, cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.readv != NULL) { + fop->cbks.readv(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, 0, NULL, NULL, NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + 
ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_readv(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_readv_cbk_t func, void *data, fd_t *fd, + size_t size, off_t offset, uint32_t flags, dict_t *xdata) +{ + ec_cbk_t callback = {.readv = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(READ) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_READ, EC_FLAG_LOCK_SHARED, + target, fop_flags, ec_wind_readv, + ec_manager_readv, callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + fop->size = size; + fop->offset = offset; + fop->uint32 = flags; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, 0, NULL, NULL, NULL); + } +} + +/* FOP: seek */ + +int32_t +ec_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, off_t offset, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + ec_t *ec = this->private; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + 
GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_SEEK, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (op_ret >= 0) { + cbk->offset = offset; + } + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + } + + if ((op_ret > 0) && ((cbk->offset % ec->fragment_size) != 0)) { + cbk->op_ret = -1; + cbk->op_errno = EIO; + } + + ec_combine(cbk, NULL); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_seek(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_seek_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->seek, fop->fd, + fop->offset, fop->seek, fop->xdata); +} + +int32_t +ec_manager_seek(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + uint64_t size; + + switch (state) { + case EC_STATE_INIT: + fop->user_size = fop->offset; + fop->head = ec_adjust_offset_down(fop->xl->private, &fop->offset, + _gf_true); + + /* Fall through */ + + case EC_STATE_LOCK: + ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, fop->offset, + EC_RANGE_FULL); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + /* This shouldn't fail because we have the inode locked. */ + GF_ASSERT( + ec_get_inode_size(fop, fop->locks[0].lock->loc.inode, &size)); + + if (fop->user_size >= size) { + ec_fop_set_error(fop, ENXIO); + + return EC_STATE_REPORT; + } + + ec_dispatch_one(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + if (ec_dispatch_one_retry(fop, &cbk)) { + return EC_STATE_DISPATCH; + } + if ((cbk != NULL) && (cbk->op_ret >= 0)) { + ec_t *ec = fop->xl->private; + + /* This shouldn't fail because we have the inode locked. 
*/ + GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode, + &size)); + + cbk->offset *= ec->fragments; + if (cbk->offset < fop->user_size) { + cbk->offset = fop->user_size; + } + if (cbk->offset > size) { + cbk->offset = size; + } + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.seek != NULL) { + fop->cbks.seek(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, cbk->offset, cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.seek != NULL) { + fop->cbks.seek(fop->req_frame, fop, fop->xl, -1, fop->error, 0, + NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, 0, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_seek(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_seek_cbk_t func, void *data, fd_t *fd, + off_t offset, gf_seek_what_t what, dict_t *xdata) +{ + ec_cbk_t callback = {.seek = func}; + ec_fop_data_t *fop = NULL; + int32_t error = EIO; + + gf_msg_trace("ec", 0, "EC(SEEK) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_SEEK, EC_FLAG_LOCK_SHARED, + target, fop_flags, ec_wind_seek, ec_manager_seek, + callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + fop->offset = offset; + fop->seek = what; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + } + if (xdata != NULL) { + 
fop->xdata = dict_ref(xdata); + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, EIO, 0, NULL); + } +} + +/* FOP: stat */ + +int32_t +ec_combine_stat(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src) +{ + if (!ec_iatt_combine(fop, dst->iatt, src->iatt, 1)) { + gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_IATT_MISMATCH, + "Mismatching iatt in " + "answers of 'GF_FOP_STAT'"); + + return 0; + } + + return 1; +} + +int32_t +ec_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_STAT, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (op_ret >= 0) { + if (buf != NULL) { + cbk->iatt[0] = *buf; + } + } + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + ec_combine(cbk, ec_combine_stat); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_stat(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_stat_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->stat, + &fop->loc[0], fop->xdata); +} + +int32_t +ec_manager_stat(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_LOCK: + if (fop->fd == NULL) { + 
ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0, + EC_RANGE_FULL); + } else { + ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0, + EC_RANGE_FULL); + } + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_true); + + if (cbk != NULL) { + if (cbk->iatt[0].ia_type == IA_IFREG) { + ec_iatt_rebuild(fop->xl->private, cbk->iatt, 1, cbk->count); + + /* This shouldn't fail because we have the inode locked. */ + GF_ASSERT(ec_get_inode_size(fop, + fop->locks[0].lock->loc.inode, + &cbk->iatt[0].ia_size)); + } + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->id == GF_FOP_STAT) { + if (fop->cbks.stat != NULL) { + fop->cbks.stat(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, &cbk->iatt[0], cbk->xdata); + } + } else { + if (fop->cbks.fstat != NULL) { + fop->cbks.fstat(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, &cbk->iatt[0], cbk->xdata); + } + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->id == GF_FOP_STAT) { + if (fop->cbks.stat != NULL) { + fop->cbks.stat(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL); + } + } else { + if (fop->cbks.fstat != NULL) { + fop->cbks.fstat(fop->req_frame, fop, fop->xl, -1, + fop->error, NULL, NULL); + } + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + 
+void +ec_stat(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_stat_cbk_t func, void *data, loc_t *loc, + dict_t *xdata) +{ + ec_cbk_t callback = {.stat = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(STAT) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_STAT, EC_FLAG_LOCK_SHARED, + target, fop_flags, ec_wind_stat, ec_manager_stat, + callback, data); + if (fop == NULL) { + goto out; + } + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL); + } +} + +/* FOP: fstat */ + +int32_t +ec_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FSTAT, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (op_ret >= 0) { + if (buf != NULL) { + cbk->iatt[0] = *buf; + } + } + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == 
NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + ec_combine(cbk, ec_combine_stat); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_fstat(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_fstat_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->fstat, fop->fd, + fop->xdata); +} + +void +ec_fstat(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fstat_cbk_t func, void *data, fd_t *fd, + dict_t *xdata) +{ + ec_cbk_t callback = {.fstat = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(FSTAT) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_FSTAT, EC_FLAG_LOCK_SHARED, + target, fop_flags, ec_wind_fstat, + ec_manager_stat, callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL); + } +} diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c new file mode 100644 index 00000000000..9b5fe2a7fdc --- /dev/null +++ b/xlators/cluster/ec/src/ec-inode-write.c @@ -0,0 +1,2369 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. 
<http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "ec-messages.h" +#include "ec-helpers.h" +#include "ec-common.h" +#include "ec-combine.h" +#include "ec-method.h" +#include "ec-fops.h" +#include "ec-mem-types.h" + +int32_t +ec_update_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + ec_fop_data_t *fop = cookie; + ec_cbk_data_t *cbk = NULL; + ec_fop_data_t *parent = fop->parent; + int i = 0; + + ec_trace("UPDATE_WRITEV_CBK", cookie, "ret=%d, errno=%d, parent-fop=%s", + op_ret, op_errno, ec_fop_name(parent->id)); + + if (op_ret < 0) { + ec_fop_set_error(parent, op_errno); + goto out; + } + cbk = ec_cbk_data_allocate(parent->frame, this, parent, parent->id, 0, + op_ret, op_errno); + if (!cbk) { + ec_fop_set_error(parent, ENOMEM); + goto out; + } + + if (xdata) + cbk->xdata = dict_ref(xdata); + + if (prebuf) + cbk->iatt[i++] = *prebuf; + + if (postbuf) + cbk->iatt[i++] = *postbuf; + + LOCK(&parent->lock); + { + parent->good &= fop->good; + + if (gf_bits_count(parent->good) < parent->minimum) { + __ec_fop_set_error(parent, EIO); + } else if (fop->error == 0 && parent->answer == NULL) { + parent->answer = cbk; + } + } + UNLOCK(&parent->lock); +out: + return 0; +} + +static int32_t +ec_update_write(ec_fop_data_t *fop, uintptr_t mask, off_t offset, uint64_t size) +{ + struct iobref *iobref = NULL; + struct iobuf *iobuf = NULL; + struct iovec vector; + int32_t err = -ENOMEM; + + iobref = iobref_new(); + if (iobref == NULL) { + goto out; + } + iobuf = iobuf_get(fop->xl->ctx->iobuf_pool); + if (iobuf == NULL) { + goto out; + } + err = iobref_add(iobref, iobuf); + if (err != 0) { + 
goto out; + } + + if (fop->locks[0].lock) + ec_lock_update_good(fop->locks[0].lock, fop); + vector.iov_base = iobuf->ptr; + vector.iov_len = size; + memset(vector.iov_base, 0, vector.iov_len); + + ec_writev(fop->frame, fop->xl, mask, fop->minimum, ec_update_writev_cbk, + NULL, fop->fd, &vector, 1, offset, 0, iobref, NULL); + + err = 0; + +out: + if (iobuf != NULL) { + iobuf_unref(iobuf); + } + if (iobref != NULL) { + iobref_unref(iobref); + } + + return err; +} + +int +ec_inode_write_cbk(call_frame_t *frame, xlator_t *this, void *cookie, + int op_ret, int op_errno, struct iatt *prestat, + struct iatt *poststat, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int i = 0; + int idx = 0; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + idx = (int32_t)(uintptr_t)cookie; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, fop->id, idx, op_ret, + op_errno); + if (!cbk) + goto out; + + if (op_ret < 0) + goto out; + + if (xdata) + cbk->xdata = dict_ref(xdata); + + if (prestat) + cbk->iatt[i++] = *prestat; + + if (poststat) + cbk->iatt[i++] = *poststat; + +out: + if (cbk) + ec_combine(cbk, ec_combine_write); + + if (fop) + ec_complete(fop); + return 0; +} +/* FOP: removexattr */ + +int32_t +ec_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, NULL, NULL, + xdata); +} + +void +ec_wind_removexattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_removexattr_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->removexattr, + &fop->loc[0], fop->str[0], 
fop->xdata); +} + +void +ec_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + ec_fop_data_t *fop = cookie; + switch (fop->id) { + case GF_FOP_SETXATTR: + if (fop->cbks.setxattr) { + QUORUM_CBK(fop->cbks.setxattr, fop, frame, cookie, this, op_ret, + op_errno, xdata); + } + break; + case GF_FOP_REMOVEXATTR: + if (fop->cbks.removexattr) { + QUORUM_CBK(fop->cbks.removexattr, fop, frame, cookie, this, + op_ret, op_errno, xdata); + } + break; + case GF_FOP_FSETXATTR: + if (fop->cbks.fsetxattr) { + QUORUM_CBK(fop->cbks.fsetxattr, fop, frame, cookie, this, + op_ret, op_errno, xdata); + } + break; + case GF_FOP_FREMOVEXATTR: + if (fop->cbks.fremovexattr) { + QUORUM_CBK(fop->cbks.fremovexattr, fop, frame, cookie, this, + op_ret, op_errno, xdata); + } + break; + } +} + +int32_t +ec_manager_xattr(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_LOCK: + if (fop->fd == NULL) { + ec_lock_prepare_inode(fop, &fop->loc[0], + EC_UPDATE_META | EC_QUERY_INFO, 0, + EC_RANGE_FULL); + } else { + ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META | EC_QUERY_INFO, + 0, EC_RANGE_FULL); + } + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + ec_fop_prepare_answer(fop, _gf_false); + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + ec_xattr_cbk(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, cbk->xdata); + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + ec_xattr_cbk(fop->req_frame, fop, fop->xl, -1, fop->error, NULL); + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + 
return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_removexattr(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_removexattr_cbk_t func, void *data, + loc_t *loc, const char *name, dict_t *xdata) +{ + ec_cbk_t callback = {.removexattr = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(REMOVEXATTR) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_REMOVEXATTR, 0, target, + fop_flags, ec_wind_removexattr, ec_manager_xattr, + callback, data); + if (fop == NULL) { + goto out; + } + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (name != NULL) { + fop->str[0] = gf_strdup(name); + if (fop->str[0] == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to duplicate a string."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_copy_with_ref(xdata, NULL); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL); + } +} + +/* FOP: fremovexattr */ + +int32_t +ec_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, NULL, NULL, + xdata); +} + +void +ec_wind_fremovexattr(ec_t 
*ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_fremovexattr_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->fremovexattr, + fop->fd, fop->str[0], fop->xdata); +} + +void +ec_fremovexattr(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fremovexattr_cbk_t func, void *data, + fd_t *fd, const char *name, dict_t *xdata) +{ + ec_cbk_t callback = {.fremovexattr = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(FREMOVEXATTR) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_FREMOVEXATTR, 0, target, + fop_flags, ec_wind_fremovexattr, + ec_manager_xattr, callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (name != NULL) { + fop->str[0] = gf_strdup(name); + if (fop->str[0] == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to duplicate a string."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_copy_with_ref(xdata, NULL); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL); + } +} + +/* FOP: setattr */ + +int32_t +ec_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prestat, + struct iatt *poststat, dict_t *xdata) +{ + return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat, + 
poststat, xdata); +} + +void +ec_wind_setattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_setattr_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->setattr, + &fop->loc[0], &fop->iatt, fop->int32, fop->xdata); +} + +int32_t +ec_manager_setattr(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_LOCK: + if (fop->fd == NULL) { + ec_lock_prepare_inode(fop, &fop->loc[0], + EC_UPDATE_META | EC_QUERY_INFO, 0, + EC_RANGE_FULL); + } else { + ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META | EC_QUERY_INFO, + 0, EC_RANGE_FULL); + } + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_false); + if (cbk != NULL) { + if (cbk->iatt[0].ia_type == IA_IFREG) { + ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count); + + /* This shouldn't fail because we have the inode locked. 
*/ + GF_ASSERT(ec_get_inode_size(fop, + fop->locks[0].lock->loc.inode, + &cbk->iatt[0].ia_size)); + cbk->iatt[1].ia_size = cbk->iatt[0].ia_size; + } + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->id == GF_FOP_SETATTR) { + if (fop->cbks.setattr != NULL) { + QUORUM_CBK(fop->cbks.setattr, fop, fop->req_frame, fop, + fop->xl, cbk->op_ret, cbk->op_errno, + &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); + } + } else { + if (fop->cbks.fsetattr != NULL) { + QUORUM_CBK(fop->cbks.fsetattr, fop, fop->req_frame, fop, + fop->xl, cbk->op_ret, cbk->op_errno, + &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); + } + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->id == GF_FOP_SETATTR) { + if (fop->cbks.setattr != NULL) { + fop->cbks.setattr(fop->req_frame, fop, fop->xl, -1, + fop->error, NULL, NULL, NULL); + } + } else { + if (fop->cbks.fsetattr != NULL) { + fop->cbks.fsetattr(fop->req_frame, fop, fop->xl, -1, + fop->error, NULL, NULL, NULL); + } + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_setattr(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_setattr_cbk_t func, void *data, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + ec_cbk_t callback = {.setattr = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(SETATTR) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, 
frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_SETATTR, 0, target, + fop_flags, ec_wind_setattr, ec_manager_setattr, + callback, data); + if (fop == NULL) { + goto out; + } + + fop->int32 = valid; + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (stbuf != NULL) { + fop->iatt = *stbuf; + } + if (xdata != NULL) { + fop->xdata = dict_copy_with_ref(xdata, NULL); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL); + } +} + +/* FOP: fsetattr */ + +int32_t +ec_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prestat, + struct iatt *poststat, dict_t *xdata) +{ + return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat, + poststat, xdata); +} + +void +ec_wind_fsetattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_fsetattr_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->fsetattr, + fop->fd, &fop->iatt, fop->int32, fop->xdata); +} + +void +ec_fsetattr(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fsetattr_cbk_t func, void *data, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + ec_cbk_t callback = {.fsetattr = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(FSETATTR) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, 
GF_FOP_FSETATTR, 0, target, + fop_flags, ec_wind_fsetattr, ec_manager_setattr, + callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + fop->int32 = valid; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (stbuf != NULL) { + fop->iatt = *stbuf; + } + if (xdata != NULL) { + fop->xdata = dict_copy_with_ref(xdata, NULL); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL); + } +} + +/* FOP: setxattr */ + +int32_t +ec_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, NULL, NULL, + xdata); +} + +void +ec_wind_setxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_setxattr_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->setxattr, + &fop->loc[0], fop->dict, fop->int32, fop->xdata); +} + +void +ec_setxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_setxattr_cbk_t func, void *data, loc_t *loc, + dict_t *dict, int32_t flags, dict_t *xdata) +{ + ec_cbk_t callback = {.setxattr = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(SETXATTR) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_SETXATTR, 0, target, + fop_flags, ec_wind_setxattr, ec_manager_xattr, + callback, data); + if (fop == NULL) { + goto 
out; + } + + fop->int32 = flags; + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (dict != NULL) { + fop->dict = dict_copy_with_ref(dict, NULL); + if (fop->dict == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_copy_with_ref(xdata, NULL); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL); + } +} + +/* FOP: fsetxattr */ + +int32_t +ec_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FSETXATTR, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + ec_combine(cbk, NULL); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_fsetxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_fsetxattr_cbk, (void *)(uintptr_t)idx, + 
ec->xl_list[idx], ec->xl_list[idx]->fops->fsetxattr, + fop->fd, fop->dict, fop->int32, fop->xdata); +} + +void +ec_fsetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fsetxattr_cbk_t func, void *data, fd_t *fd, + dict_t *dict, int32_t flags, dict_t *xdata) +{ + ec_cbk_t callback = {.fsetxattr = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(FSETXATTR) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_FSETXATTR, 0, target, + fop_flags, ec_wind_fsetxattr, ec_manager_xattr, + callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + fop->int32 = flags; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (dict != NULL) { + fop->dict = dict_copy_with_ref(dict, NULL); + if (fop->dict == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_copy_with_ref(xdata, NULL); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL); + } +} + +/********************************************************************* + * + * File Operation : fallocate + * + *********************************************************************/ + +int32_t +ec_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + return ec_inode_write_cbk(frame, 
this, cookie, op_ret, op_errno, prebuf, + postbuf, xdata); +} + +void +ec_wind_fallocate(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_fallocate_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->fallocate, + fop->fd, fop->int32, fop->offset, fop->size, fop->xdata); +} + +int32_t +ec_manager_fallocate(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk = NULL; + + switch (state) { + case EC_STATE_INIT: + if (fop->size == 0) { + ec_fop_set_error(fop, EINVAL); + return EC_STATE_REPORT; + } + if (fop->int32 & + (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | + FALLOC_FL_ZERO_RANGE | FALLOC_FL_PUNCH_HOLE)) { + ec_fop_set_error(fop, ENOTSUP); + return EC_STATE_REPORT; + } + fop->user_size = fop->offset + fop->size; + fop->head = ec_adjust_offset_down(fop->xl->private, &fop->offset, + _gf_true); + fop->size += fop->head; + ec_adjust_size_up(fop->xl->private, &fop->size, _gf_true); + + /* Fall through */ + + case EC_STATE_LOCK: + ec_lock_prepare_fd(fop, fop->fd, + EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO, + fop->offset, fop->size); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_false); + if (cbk != NULL) { + ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count); + + /* This shouldn't fail because we have the inode locked. */ + LOCK(&fop->locks[0].lock->loc.inode->lock); + { + GF_ASSERT(__ec_get_inode_size(fop, + fop->locks[0].lock->loc.inode, + &cbk->iatt[0].ia_size)); + + /*If mode has FALLOC_FL_KEEP_SIZE keep the size */ + if (fop->int32 & FALLOC_FL_KEEP_SIZE) { + cbk->iatt[1].ia_size = cbk->iatt[0].ia_size; + } else if (fop->user_size > cbk->iatt[0].ia_size) { + cbk->iatt[1].ia_size = fop->user_size; + + /* This shouldn't fail because we have the inode + * locked. 
*/ + GF_ASSERT(__ec_set_inode_size( + fop, fop->locks[0].lock->loc.inode, + cbk->iatt[1].ia_size)); + } else { + cbk->iatt[1].ia_size = cbk->iatt[0].ia_size; + } + } + UNLOCK(&fop->locks[0].lock->loc.inode->lock); + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.fallocate != NULL) { + QUORUM_CBK(fop->cbks.fallocate, fop, fop->req_frame, fop, + fop->xl, cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.fallocate != NULL) { + fop->cbks.fallocate(fop->req_frame, fop, fop->xl, -1, + fop->error, NULL, NULL, NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fallocate_cbk_t func, void *data, fd_t *fd, + int32_t mode, off_t offset, size_t len, dict_t *xdata) +{ + ec_cbk_t callback = {.fallocate = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(FALLOCATE) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_FALLOCATE, 0, target, + fop_flags, ec_wind_fallocate, + ec_manager_fallocate, callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + fop->int32 = mode; + fop->offset = offset; + 
fop->size = len; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + goto out; + } + } + + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL); + } +} + +/********************************************************************* + * + * File Operation : Discard + * + *********************************************************************/ +void +ec_update_discard_write(ec_fop_data_t *fop, uintptr_t mask) +{ + ec_t *ec = fop->xl->private; + off_t off_head = 0; + off_t off_tail = 0; + uint64_t size_head = 0; + uint64_t size_tail = 0; + int error = 0; + + off_head = fop->offset * ec->fragments - fop->int32; + if (fop->size == 0) { + error = ec_update_write(fop, mask, off_head, fop->user_size); + } else { + size_head = fop->int32; + size_tail = (off_head + fop->user_size) % ec->stripe_size; + off_tail = off_head + fop->user_size - size_tail; + if (size_head) { + error = ec_update_write(fop, mask, off_head, size_head); + if (error) { + goto out; + } + } + if (size_tail) { + error = ec_update_write(fop, mask, off_tail, size_tail); + } + } +out: + if (error) + ec_fop_set_error(fop, -error); +} + +void +ec_discard_adjust_offset_size(ec_fop_data_t *fop) +{ + ec_t *ec = fop->xl->private; + + fop->user_size = fop->size; + /* If discard length covers at least a fragment on brick, we will + * perform discard operation(when fop->size is non-zero) else we just + * write zeros. 
+ */ + fop->int32 = ec_adjust_offset_up(ec, &fop->offset, _gf_true); + fop->frag_range.first = fop->offset; + if (fop->size < fop->int32) { + fop->size = 0; + } else { + fop->size -= fop->int32; + ec_adjust_size_down(ec, &fop->size, _gf_true); + } + fop->frag_range.last = fop->offset + fop->size; +} + +int32_t +ec_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prebuf, + postbuf, xdata); +} + +void +ec_wind_discard(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_discard_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->discard, + fop->fd, fop->offset, fop->size, fop->xdata); +} + +int32_t +ec_manager_discard(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk = NULL; + off_t fl_start = 0; + uint64_t fl_size = 0; + + switch (state) { + case EC_STATE_INIT: + if ((fop->size <= 0) || (fop->offset < 0)) { + ec_fop_set_error(fop, EINVAL); + return EC_STATE_REPORT; + } + /* Because of the head/tail writes, "discard" happens on the + * remaining regions, but we need to compute region including + * head/tail writes so compute them separately*/ + fl_start = fop->offset; + fl_size = fop->size; + fl_size += ec_adjust_offset_down(fop->xl->private, &fl_start, + _gf_true); + ec_adjust_size_up(fop->xl->private, &fl_size, _gf_true); + + ec_discard_adjust_offset_size(fop); + + /* Fall through */ + + case EC_STATE_LOCK: + ec_lock_prepare_fd(fop, fop->fd, + EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO, + fl_start, fl_size); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + + /* Dispatch discard fop only if we have whole fragment + * to deallocate */ + if (fop->size) { + ec_dispatch_all(fop); + return EC_STATE_DELAYED_START; + } else { + /* Assume discard to have succeeded on 
all bricks */ + ec_succeed_all(fop); + } + + /* Fall through */ + + case EC_STATE_DELAYED_START: + + if (fop->size) { + if (fop->answer && fop->answer->op_ret == 0) + ec_update_discard_write(fop, fop->answer->mask); + } else { + ec_update_discard_write(fop, fop->mask); + } + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_false); + if (cbk != NULL) { + ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count); + + /* This shouldn't fail because we have the inode locked. */ + GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode, + &cbk->iatt[0].ia_size)); + + cbk->iatt[1].ia_size = cbk->iatt[0].ia_size; + } + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.discard != NULL) { + QUORUM_CBK(fop->cbks.discard, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_DELAYED_START: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.discard != NULL) { + fop->cbks.discard(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL, NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_discard(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_discard_cbk_t func, void *data, fd_t *fd, + off_t offset, size_t len, dict_t *xdata) +{ + ec_cbk_t callback = {.discard = func}; + ec_fop_data_t *fop = NULL; + 
int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(DISCARD) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_DISCARD, 0, target, + fop_flags, ec_wind_discard, ec_manager_discard, + callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + fop->offset = offset; + fop->size = len; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + } + + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL); + } +} + +/********************************************************************* + * + * File Operation : truncate + * + *********************************************************************/ + +int32_t +ec_update_truncate_write(ec_fop_data_t *fop, uintptr_t mask) +{ + ec_t *ec = fop->xl->private; + uint64_t size = fop->offset * ec->fragments - fop->user_size; + return ec_update_write(fop, mask, fop->user_size, size); +} + +int32_t +ec_truncate_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + ec_fop_data_t *fop = cookie; + int32_t err; + + fop->parent->good &= fop->good; + if (op_ret >= 0) { + fd_bind(fd); + err = ec_update_truncate_write(fop->parent, fop->answer->mask); + if (err != 0) { + ec_fop_set_error(fop->parent, -err); + } + } + + return 0; +} + +int32_t +ec_truncate_clean(ec_fop_data_t *fop) +{ + if (fop->fd == NULL) { + fop->fd = fd_create(fop->loc[0].inode, fop->frame->root->pid); + if (fop->fd == NULL) { + return -ENOMEM; + } + + ec_open(fop->frame, fop->xl, fop->answer->mask, fop->minimum, + ec_truncate_open_cbk, fop, &fop->loc[0], O_RDWR, fop->fd, NULL); + + return 0; + } else { + return ec_update_truncate_write(fop, fop->answer->mask); + } +} + +int32_t +ec_truncate_cbk(call_frame_t 
*frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prestat, + struct iatt *poststat, dict_t *xdata) +{ + return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat, + poststat, xdata); +} + +void +ec_wind_truncate(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_truncate_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->truncate, + &fop->loc[0], fop->offset, fop->xdata); +} + +int32_t +ec_manager_truncate(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + off_t offset_down; + + switch (state) { + case EC_STATE_INIT: + fop->user_size = fop->offset; + ec_adjust_offset_up(fop->xl->private, &fop->offset, _gf_true); + fop->frag_range.first = fop->offset; + fop->frag_range.last = UINT64_MAX; + + /* Fall through */ + + case EC_STATE_LOCK: + offset_down = fop->user_size; + ec_adjust_offset_down(fop->xl->private, &offset_down, _gf_true); + + if (fop->id == GF_FOP_TRUNCATE) { + ec_lock_prepare_inode( + fop, &fop->loc[0], + EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO, + offset_down, EC_RANGE_FULL); + } else { + ec_lock_prepare_fd( + fop, fop->fd, + EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO, + offset_down, EC_RANGE_FULL); + } + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_false); + if (cbk != NULL) { + int32_t err; + + ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count); + + /* This shouldn't fail because we have the inode locked. */ + /* Inode size doesn't need to be updated under locks, because + * conflicting operations won't be in-flight + */ + GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode, + &cbk->iatt[0].ia_size)); + cbk->iatt[1].ia_size = fop->user_size; + /* This shouldn't fail because we have the inode locked. 
*/ + GF_ASSERT(ec_set_inode_size(fop, fop->locks[0].lock->loc.inode, + fop->user_size)); + if ((cbk->iatt[0].ia_size > cbk->iatt[1].ia_size) && + (fop->user_size != fop->offset)) { + err = ec_truncate_clean(fop); + if (err != 0) { + ec_cbk_set_error(cbk, -err, _gf_false); + } + } + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->id == GF_FOP_TRUNCATE) { + if (fop->cbks.truncate != NULL) { + QUORUM_CBK(fop->cbks.truncate, fop, fop->req_frame, fop, + fop->xl, cbk->op_ret, cbk->op_errno, + &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); + } + } else { + if (fop->cbks.ftruncate != NULL) { + QUORUM_CBK(fop->cbks.ftruncate, fop, fop->req_frame, fop, + fop->xl, cbk->op_ret, cbk->op_errno, + &cbk->iatt[0], &cbk->iatt[1], cbk->xdata); + } + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->id == GF_FOP_TRUNCATE) { + if (fop->cbks.truncate != NULL) { + fop->cbks.truncate(fop->req_frame, fop, fop->xl, -1, + fop->error, NULL, NULL, NULL); + } + } else { + if (fop->cbks.ftruncate != NULL) { + fop->cbks.ftruncate(fop->req_frame, fop, fop->xl, -1, + fop->error, NULL, NULL, NULL); + } + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_truncate(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_truncate_cbk_t func, void *data, loc_t *loc, + off_t offset, dict_t *xdata) +{ + ec_cbk_t callback = {.truncate = func}; + ec_fop_data_t *fop = NULL; + int32_t error = 
ENOMEM; + + gf_msg_trace("ec", 0, "EC(TRUNCATE) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_TRUNCATE, 0, target, + fop_flags, ec_wind_truncate, ec_manager_truncate, + callback, data); + if (fop == NULL) { + goto out; + } + + fop->offset = offset; + + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_copy_with_ref(xdata, NULL); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL); + } +} + +/* FOP: ftruncate */ + +int32_t +ec_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prestat, + struct iatt *poststat, dict_t *xdata) +{ + return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat, + poststat, xdata); +} + +void +ec_wind_ftruncate(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_ftruncate_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->ftruncate, + fop->fd, fop->offset, fop->xdata); +} + +void +ec_ftruncate(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_ftruncate_cbk_t func, void *data, fd_t *fd, + off_t offset, dict_t *xdata) +{ + ec_cbk_t callback = {.ftruncate = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(FTRUNCATE) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, 
this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_FTRUNCATE, 0, target, + fop_flags, ec_wind_ftruncate, + ec_manager_truncate, callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + fop->offset = offset; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_copy_with_ref(xdata, NULL); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL); + } +} + +/* FOP: writev */ +static ec_stripe_t * +ec_allocate_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache) +{ + ec_stripe_t *stripe = NULL; + + if (stripe_cache->count >= stripe_cache->max) { + GF_ASSERT(!list_empty(&stripe_cache->lru)); + stripe = list_first_entry(&stripe_cache->lru, ec_stripe_t, lru); + list_move_tail(&stripe->lru, &stripe_cache->lru); + GF_ATOMIC_INC(ec->stats.stripe_cache.evicts); + } else { + stripe = GF_MALLOC(sizeof(ec_stripe_t) + ec->stripe_size, + ec_mt_ec_stripe_t); + if (stripe != NULL) { + stripe_cache->count++; + list_add_tail(&stripe->lru, &stripe_cache->lru); + GF_ATOMIC_INC(ec->stats.stripe_cache.allocs); + } else { + GF_ATOMIC_INC(ec->stats.stripe_cache.errors); + } + } + + return stripe; +} + +static void +ec_write_stripe_data(ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe) +{ + off_t base; + + base = fop->size - ec->stripe_size; + memcpy(stripe->data, fop->vector[0].iov_base + base, ec->stripe_size); + stripe->frag_offset = fop->frag_range.last - ec->fragment_size; +} + +static void +ec_add_stripe_in_cache(ec_t *ec, ec_fop_data_t *fop) +{ + ec_inode_t *ctx = NULL; + ec_stripe_t *stripe = NULL; + ec_stripe_list_t 
*stripe_cache = NULL; + gf_boolean_t failed = _gf_true; + + LOCK(&fop->fd->inode->lock); + + ctx = __ec_inode_get(fop->fd->inode, fop->xl); + if (ctx == NULL) { + goto out; + } + + stripe_cache = &ctx->stripe_cache; + if (stripe_cache->max > 0) { + stripe = ec_allocate_stripe(ec, stripe_cache); + if (stripe == NULL) { + goto out; + } + + ec_write_stripe_data(ec, fop, stripe); + } + + failed = _gf_false; + +out: + UNLOCK(&fop->fd->inode->lock); + + if (failed) { + gf_msg(ec->xl->name, GF_LOG_DEBUG, ENOMEM, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to create and add stripe in cache"); + } +} + +int32_t +ec_writev_merge_tail(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) +{ + ec_t *ec = this->private; + ec_fop_data_t *fop = frame->local; + uint64_t size, base, tmp; + + if (op_ret >= 0) { + tmp = 0; + size = fop->size - fop->user_size - fop->head; + base = ec->stripe_size - size; + if (op_ret > base) { + tmp = min(op_ret - base, size); + ec_iov_copy_to(fop->vector[0].iov_base + fop->size - size, vector, + count, base, tmp); + + size -= tmp; + } + + if (size > 0) { + memset(fop->vector[0].iov_base + fop->size - size, 0, size); + } + + if (ec->stripe_cache) { + ec_add_stripe_in_cache(ec, fop); + } + } + return 0; +} + +int32_t +ec_writev_merge_head(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) +{ + ec_t *ec = this->private; + ec_fop_data_t *fop = frame->local; + uint64_t size, base; + + if (op_ret >= 0) { + size = fop->head; + base = 0; + + if (op_ret > 0) { + base = min(op_ret, size); + ec_iov_copy_to(fop->vector[0].iov_base, vector, count, 0, base); + + size -= base; + } + + if (size > 0) { + memset(fop->vector[0].iov_base + base, 0, size); + } + + size = fop->size - fop->user_size - fop->head; + if 
((size > 0) && (fop->size == ec->stripe_size)) { + ec_writev_merge_tail(frame, cookie, this, op_ret, op_errno, vector, + count, stbuf, iobref, xdata); + } + } + + return 0; +} + +static int +ec_make_internal_fop_xdata(dict_t **xdata) +{ + dict_t *dict = NULL; + + if (*xdata) + return 0; + + dict = dict_new(); + if (!dict) + goto out; + + if (dict_set_str(dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes")) + goto out; + + *xdata = dict; + return 0; +out: + if (dict) + dict_unref(dict); + return -1; +} + +static int32_t +ec_writev_prepare_buffers(ec_t *ec, ec_fop_data_t *fop) +{ + struct iobref *iobref = NULL; + struct iovec *iov; + void *ptr; + int32_t err; + + fop->user_size = iov_length(fop->vector, fop->int32); + fop->head = ec_adjust_offset_down(ec, &fop->offset, _gf_false); + fop->frag_range.first = fop->offset / ec->fragments; + fop->size = fop->user_size + fop->head; + ec_adjust_size_up(ec, &fop->size, _gf_false); + fop->frag_range.last = fop->frag_range.first + fop->size / ec->fragments; + + if ((fop->int32 != 1) || (fop->head != 0) || (fop->size > fop->user_size) || + !EC_ALIGN_CHECK(fop->vector[0].iov_base, EC_METHOD_WORD_SIZE)) { + err = ec_buffer_alloc(ec->xl, fop->size, &iobref, &ptr); + if (err != 0) { + goto out; + } + + ec_iov_copy_to(ptr + fop->head, fop->vector, fop->int32, 0, + fop->user_size); + + fop->vector[0].iov_base = ptr; + fop->vector[0].iov_len = fop->size; + + iobref_unref(fop->buffers); + fop->buffers = iobref; + } + + if (fop->int32 != 2) { + iov = GF_MALLOC(VECTORSIZE(2), gf_common_mt_iovec); + if (iov == NULL) { + err = -ENOMEM; + + goto out; + } + iov[0].iov_base = fop->vector[0].iov_base; + iov[0].iov_len = fop->vector[0].iov_len; + + GF_FREE(fop->vector); + fop->vector = iov; + } + + fop->vector[1].iov_len = fop->size / ec->fragments; + err = ec_buffer_alloc(ec->xl, fop->vector[1].iov_len * ec->nodes, + &fop->buffers, &fop->vector[1].iov_base); + if (err != 0) { + goto out; + } + + err = 0; + +out: + return err; +} + +static void 
+ec_merge_stripe_head_locked(ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe) +{ + uint32_t head, size; + + head = fop->head; + memcpy(fop->vector[0].iov_base, stripe->data, head); + + size = ec->stripe_size - head; + if (size > fop->user_size) { + head += fop->user_size; + size = ec->stripe_size - head; + memcpy(fop->vector[0].iov_base + head, stripe->data + head, size); + } +} + +static void +ec_merge_stripe_tail_locked(ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe) +{ + uint32_t head, tail; + off_t offset; + + offset = fop->user_size + fop->head; + tail = fop->size - offset; + head = ec->stripe_size - tail; + + memcpy(fop->vector[0].iov_base + offset, stripe->data + head, tail); +} + +static ec_stripe_t * +ec_get_stripe_from_cache_locked(ec_t *ec, ec_fop_data_t *fop, + uint64_t frag_offset) +{ + ec_inode_t *ctx = NULL; + ec_stripe_t *stripe = NULL; + ec_stripe_list_t *stripe_cache = NULL; + + ctx = __ec_inode_get(fop->fd->inode, fop->xl); + if (ctx == NULL) { + GF_ATOMIC_INC(ec->stats.stripe_cache.errors); + return NULL; + } + + stripe_cache = &ctx->stripe_cache; + list_for_each_entry(stripe, &stripe_cache->lru, lru) + { + if (stripe->frag_offset == frag_offset) { + list_move_tail(&stripe->lru, &stripe_cache->lru); + GF_ATOMIC_INC(ec->stats.stripe_cache.hits); + return stripe; + } + } + + GF_ATOMIC_INC(ec->stats.stripe_cache.misses); + + return NULL; +} + +static gf_boolean_t +ec_get_and_merge_stripe(ec_t *ec, ec_fop_data_t *fop, ec_stripe_part_t which) +{ + uint64_t frag_offset; + ec_stripe_t *stripe = NULL; + gf_boolean_t found = _gf_false; + + if (!ec->stripe_cache) { + return found; + } + + LOCK(&fop->fd->inode->lock); + if (which == EC_STRIPE_HEAD) { + frag_offset = fop->frag_range.first; + stripe = ec_get_stripe_from_cache_locked(ec, fop, frag_offset); + if (stripe) { + ec_merge_stripe_head_locked(ec, fop, stripe); + found = _gf_true; + } + } + + if (which == EC_STRIPE_TAIL) { + frag_offset = fop->frag_range.last - ec->fragment_size; + stripe = 
ec_get_stripe_from_cache_locked(ec, fop, frag_offset); + if (stripe) { + ec_merge_stripe_tail_locked(ec, fop, stripe); + found = _gf_true; + } + } + UNLOCK(&fop->fd->inode->lock); + + return found; +} + +static uintptr_t +ec_get_lock_good_mask(inode_t *inode, xlator_t *xl) +{ + ec_lock_t *lock = NULL; + ec_inode_t *ictx = NULL; + LOCK(&inode->lock); + { + ictx = __ec_inode_get(inode, xl); + if (ictx) + lock = ictx->inode_lock; + } + UNLOCK(&inode->lock); + if (lock) + return lock->good_mask; + return 0; +} + +void +ec_writev_start(ec_fop_data_t *fop) +{ + ec_t *ec = fop->xl->private; + ec_fd_t *ctx; + fd_t *fd; + dict_t *xdata = NULL; + uint64_t tail, current; + int32_t err = -ENOMEM; + gf_boolean_t found_stripe = _gf_false; + + /* This shouldn't fail because we have the inode locked. */ + GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &current)); + + fd = fd_anonymous(fop->fd->inode); + if (fd == NULL) { + goto failed; + } + + fop->frame->root->uid = 0; + fop->frame->root->gid = 0; + + ctx = ec_fd_get(fop->fd, fop->xl); + if (ctx != NULL) { + if ((ctx->flags & O_APPEND) != 0) { + /* Appending writes take full locks so size won't change because + * of any parallel operations + */ + fop->offset = current; + } + } + + err = ec_writev_prepare_buffers(ec, fop); + if (err != 0) { + goto failed_fd; + } + tail = fop->size - fop->user_size - fop->head; + if (fop->head > 0) { + if (current > fop->offset) { + found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD); + if (!found_stripe) { + if (ec_make_internal_fop_xdata(&xdata)) { + err = -ENOMEM; + goto failed_xdata; + } + ec_readv(fop->frame, fop->xl, + ec_get_lock_good_mask(fop->fd->inode, fop->xl), + EC_MINIMUM_MIN, ec_writev_merge_head, NULL, fd, + ec->stripe_size, fop->offset, 0, xdata); + } + } else { + memset(fop->vector[0].iov_base, 0, fop->head); + memset(fop->vector[0].iov_base + fop->size - tail, 0, tail); + if (ec->stripe_cache && (fop->size <= ec->stripe_size)) { + ec_add_stripe_in_cache(ec, fop); + } + } +
} + + if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) { + /* Current locking scheme will make sure the 'current' below will + * never decrease while the fop is in progress, so the checks will + * work as expected + */ + if (current > fop->offset + fop->head + fop->user_size) { + found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_TAIL); + if (!found_stripe) { + if (ec_make_internal_fop_xdata(&xdata)) { + err = -ENOMEM; + goto failed_xdata; + } + ec_readv(fop->frame, fop->xl, + ec_get_lock_good_mask(fop->fd->inode, fop->xl), + EC_MINIMUM_MIN, ec_writev_merge_tail, NULL, fd, + ec->stripe_size, + fop->offset + fop->size - ec->stripe_size, 0, xdata); + } + } else { + memset(fop->vector[0].iov_base + fop->size - tail, 0, tail); + if (ec->stripe_cache) { + ec_add_stripe_in_cache(ec, fop); + } + } + } + + err = 0; + +failed_xdata: + if (xdata) { + dict_unref(xdata); + } +failed_fd: + fd_unref(fd); +failed: + ec_fop_set_error(fop, -err); +} + +int32_t +ec_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prestat, struct iatt *poststat, + dict_t *xdata) +{ + ec_t *ec = NULL; + if (this && this->private) { + ec = this->private; + if ((op_ret > 0) && ((op_ret % ec->fragment_size) != 0)) { + op_ret = -1; + op_errno = EIO; + } + } + return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat, + poststat, xdata); +} + +void +ec_wind_writev(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + struct iovec vector[1]; + size_t size; + + size = fop->vector[1].iov_len; + + vector[0].iov_base = fop->vector[1].iov_base + idx * size; + vector[0].iov_len = size; + + STACK_WIND_COOKIE(fop->frame, ec_writev_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->writev, fop->fd, + vector, 1, fop->offset / ec->fragments, fop->uint32, + fop->buffers, fop->xdata); +} + +static void +ec_writev_encode(ec_fop_data_t *fop) +{ + ec_t *ec = 
fop->xl->private; + void *blocks[ec->nodes]; + uint32_t i; + + blocks[0] = fop->vector[1].iov_base; + for (i = 1; i < ec->nodes; i++) { + blocks[i] = blocks[i - 1] + fop->vector[1].iov_len; + } + ec_method_encode(&ec->matrix, fop->vector[0].iov_len, + fop->vector[0].iov_base, blocks); +} + +int32_t +ec_manager_writev(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + ec_fd_t *ctx = NULL; + ec_t *ec = fop->xl->private; + off_t fl_start = 0; + uint64_t fl_size = LONG_MAX; + + switch (state) { + case EC_STATE_INIT: + case EC_STATE_LOCK: + ctx = ec_fd_get(fop->fd, fop->xl); + if (ctx != NULL) { + if ((ctx->flags & O_APPEND) == 0) { + off_t user_size = 0; + off_t head = 0; + + fl_start = fop->offset; + user_size = iov_length(fop->vector, fop->int32); + head = ec_adjust_offset_down(ec, &fl_start, _gf_true); + fl_size = user_size + head; + ec_adjust_size_up(ec, &fl_size, _gf_true); + } + } + ec_lock_prepare_fd(fop, fop->fd, + EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO, + fl_start, fl_size); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + ec_writev_start(fop); + + return EC_STATE_DELAYED_START; + + case EC_STATE_DELAYED_START: + /* Restore uid, gid if they were changed to do some partial + * reads. */ + fop->frame->root->uid = fop->uid; + fop->frame->root->gid = fop->gid; + + ec_writev_encode(fop); + + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_false); + if (cbk != NULL) { + ec_t *ec = fop->xl->private; + uint64_t size; + + ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count); + + /* This shouldn't fail because we have the inode locked. 
*/ + LOCK(&fop->fd->inode->lock); + { + GF_ASSERT(__ec_get_inode_size(fop, fop->fd->inode, + &cbk->iatt[0].ia_size)); + cbk->iatt[1].ia_size = cbk->iatt[0].ia_size; + size = fop->offset + fop->head + fop->user_size; + if (size > cbk->iatt[0].ia_size) { + /* Only update inode size if this is a top level fop. + * Otherwise this is an internal write and the top + * level fop should take care of the real inode size. + */ + if (fop->parent == NULL) { + /* This shouldn't fail because we have the inode + * locked. */ + GF_ASSERT( + __ec_set_inode_size(fop, fop->fd->inode, size)); + } + cbk->iatt[1].ia_size = size; + } + } + UNLOCK(&fop->fd->inode->lock); + + if (fop->error == 0) { + cbk->op_ret *= ec->fragments; + if (cbk->op_ret < fop->head) { + cbk->op_ret = 0; + } else { + cbk->op_ret -= fop->head; + } + if (cbk->op_ret > fop->user_size) { + cbk->op_ret = fop->user_size; + } + } + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.writev != NULL) { + QUORUM_CBK(fop->cbks.writev, fop, fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, &cbk->iatt[0], + &cbk->iatt[1], cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_DELAYED_START: + /* We have failed while doing partial reads. We need to restore + * original uid, gid. 
*/ + fop->frame->root->uid = fop->uid; + fop->frame->root->gid = fop->gid; + + /* Fall through */ + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.writev != NULL) { + fop->cbks.writev(fop->req_frame, fop, fop->xl, -1, fop->error, + NULL, NULL, NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_writev(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_writev_cbk_t func, void *data, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + ec_cbk_t callback = {.writev = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(WRITE) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_WRITE, 0, target, fop_flags, + ec_wind_writev, ec_manager_writev, callback, + data); + if (fop == NULL) { + goto out; + } + + fop->int32 = count; + fop->offset = offset; + fop->uint32 = flags; + + fop->use_fd = 1; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (count > 0) { + fop->vector = iov_dup(vector, count); + if (fop->vector == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to duplicate a " + "vector list."); 
+ + goto out; + } + fop->int32 = count; + } + if (iobref != NULL) { + fop->buffers = iobref_ref(iobref); + if (fop->buffers == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_BUF_REF_FAIL, + "Failed to reference a " + "buffer."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_copy_with_ref(xdata, NULL); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL); + } +} diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c new file mode 100644 index 00000000000..601960d6154 --- /dev/null +++ b/xlators/cluster/ec/src/ec-locks.c @@ -0,0 +1,1128 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "ec-helpers.h" +#include "ec-common.h" +#include "ec-combine.h" +#include "ec-fops.h" +#include "ec-messages.h" + +#define EC_LOCK_MODE_NONE 0 +#define EC_LOCK_MODE_INC 1 +#define EC_LOCK_MODE_ALL 2 + +int32_t +ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask) +{ + ec_t *ec = fop->xl->private; + ec_cbk_data_t *ans = NULL; + ec_cbk_data_t *cbk = NULL; + uintptr_t locked = 0; + int32_t good = 0; + int32_t eagain = 0; + int32_t estale = 0; + int32_t error = -1; + + /* There are some errors that we'll handle in an special way while trying + * to acquire a lock. 
+ * + * EAGAIN: If it's found during a parallel non-blocking lock request, we + * consider that there's contention on the inode, so we consider + * the acquisition a failure and try again with a sequential + * blocking lock request. This will ensure that we get a lock on + * as many bricks as possible (ignoring EAGAIN here would cause + * unnecessary triggers of self-healing). + * + * If it's found during a sequential blocking lock request, it's + * considered an error. Lock will only succeed if there are + * enough other bricks locked. + * + * ESTALE: This can appear during parallel or sequential lock request if + * the inode has just been unlinked. We consider this error is + * not recoverable, but we also don't consider it as fatal. So, + * if it happens during parallel lock, we won't attempt a + * sequential one unless there are EAGAIN errors on other + * bricks (and are enough to form a quorum), but if we reach + * quorum counting the ESTALE bricks, we consider the whole + * result of the operation is ESTALE instead of EIO. + */ + + list_for_each_entry(ans, &fop->cbk_list, list) + { + if (ans->op_ret >= 0) { + if (locked != 0) { + error = EIO; + } + locked |= ans->mask; + good = ans->count; + cbk = ans; + } else if (ans->op_errno == ESTALE) { + estale += ans->count; + } else if ((ans->op_errno == EAGAIN) && + (fop->uint32 != EC_LOCK_MODE_INC)) { + eagain += ans->count; + } + } + + if (error == -1) { + /* If we have enough quorum with succeeded and EAGAIN answers, we + * ignore for now any ESTALE answer. If there are EAGAIN answers, + * we retry with a sequential blocking lock request if needed. + * Otherwise we succeed. 
*/ + if ((good + eagain) >= ec->fragments) { + if (eagain == 0) { + if (fop->answer == NULL) { + fop->answer = cbk; + } + + ec_update_good(fop, locked); + + error = 0; + } else { + switch (fop->uint32) { + case EC_LOCK_MODE_NONE: + error = EAGAIN; + break; + case EC_LOCK_MODE_ALL: + fop->uint32 = EC_LOCK_MODE_INC; + break; + default: + /* This shouldn't happen because eagain cannot be > 0 + * when fop->uint32 is EC_LOCK_MODE_INC. */ + error = EIO; + break; + } + } + } else { + /* We have been unable to find enough candidates that will be able + * to take the lock. If we have quorum on some answer, we return + * it. Otherwise we check if ESTALE answers allow us to reach + * quorum. If so, we return ESTALE. */ + if (fop->answer && fop->answer->op_ret < 0) { + error = fop->answer->op_errno; + } else if ((good + eagain + estale) >= ec->fragments) { + error = ESTALE; + } else { + error = EIO; + } + } + } + + *mask = locked; + + return error; +} + +int32_t +ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_UNLOCK_FAILED, + "Failed to unlock an entry/inode"); + } + + return 0; +} + +int32_t +ec_lock_lk_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *flock, + dict_t *xdata) +{ + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_LK_UNLOCK_FAILED, + "Failed to unlock an lk"); + } + + return 0; +} + +/* FOP: entrylk */ + +int32_t +ec_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = 
frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_ENTRYLK, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + ec_combine(cbk, NULL); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_entrylk(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_entrylk_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->entrylk, + fop->str[0], &fop->loc[0], fop->str[1], fop->entrylk_cmd, + fop->entrylk_type, fop->xdata); +} + +int32_t +ec_manager_entrylk(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + + switch (state) { + case EC_STATE_INIT: + if (fop->entrylk_cmd == ENTRYLK_LOCK) { + fop->uint32 = EC_LOCK_MODE_ALL; + fop->entrylk_cmd = ENTRYLK_LOCK_NB; + } + + /* Fall through */ + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + case -EC_STATE_PREPARE_ANSWER: + if (fop->entrylk_cmd != ENTRYLK_UNLOCK) { + uintptr_t mask; + + ec_fop_set_error(fop, ec_lock_check(fop, &mask)); + if (fop->error != 0) { + if (mask != 0) { + if (fop->id == GF_FOP_ENTRYLK) { + ec_entrylk( + fop->frame, fop->xl, mask, 1, ec_lock_unlocked, + NULL, fop->str[0], &fop->loc[0], fop->str[1], + ENTRYLK_UNLOCK, fop->entrylk_type, fop->xdata); + } else { + ec_fentrylk(fop->frame, fop->xl, mask, 1, + ec_lock_unlocked, NULL, fop->str[0], + fop->fd, fop->str[1], ENTRYLK_UNLOCK, + fop->entrylk_type, fop->xdata); + } + } + if (fop->error < 0) { + fop->error = 0; + + fop->entrylk_cmd = ENTRYLK_LOCK; + + ec_dispatch_inc(fop); + + return EC_STATE_PREPARE_ANSWER; + } + } + } else { + 
ec_fop_prepare_answer(fop, _gf_true); + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->id == GF_FOP_ENTRYLK) { + if (fop->cbks.entrylk != NULL) { + fop->cbks.entrylk(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, cbk->xdata); + } + } else { + if (fop->cbks.fentrylk != NULL) { + fop->cbks.fentrylk(fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, cbk->xdata); + } + } + + return EC_STATE_END; + + case -EC_STATE_INIT: + case -EC_STATE_DISPATCH: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->id == GF_FOP_ENTRYLK) { + if (fop->cbks.entrylk != NULL) { + fop->cbks.entrylk(fop->req_frame, fop, fop->xl, -1, + fop->error, NULL); + } + } else { + if (fop->cbks.fentrylk != NULL) { + fop->cbks.fentrylk(fop->req_frame, fop, fop->xl, -1, + fop->error, NULL); + } + } + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_entrylk(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_entrylk_cbk_t func, void *data, + const char *volume, loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +{ + ec_cbk_t callback = {.entrylk = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(ENTRYLK) %p", frame); + + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_ENTRYLK, 0, target, + fop_flags, ec_wind_entrylk, ec_manager_entrylk, + callback, data); + if (fop == NULL) { + goto out; + } + + fop->entrylk_cmd = cmd; + fop->entrylk_type = type; + + if (volume != NULL) { + fop->str[0] = gf_strdup(volume); + if (fop->str[0] == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to duplicate a string."); + + 
goto out; + } + } + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (basename != NULL) { + fop->str[1] = gf_strdup(basename); + if (fop->str[1] == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to duplicate a string."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL); + } +} + +/* FOP: fentrylk */ + +int32_t +ec_fentrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FENTRYLK, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + ec_combine(cbk, NULL); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_fentrylk(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_fentrylk_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->fentrylk, + 
fop->str[0], fop->fd, fop->str[1], fop->entrylk_cmd, + fop->entrylk_type, fop->xdata); +} + +void +ec_fentrylk(call_frame_t *frame, xlator_t *this, uintptr_t target, + uint32_t fop_flags, fop_fentrylk_cbk_t func, void *data, + const char *volume, fd_t *fd, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) +{ + ec_cbk_t callback = {.fentrylk = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(FENTRYLK) %p", frame); + + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_FENTRYLK, 0, target, + fop_flags, ec_wind_fentrylk, ec_manager_entrylk, + callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + fop->entrylk_cmd = cmd; + fop->entrylk_type = type; + + if (volume != NULL) { + fop->str[0] = gf_strdup(volume); + if (fop->str[0] == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to duplicate a string."); + + goto out; + } + } + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (basename != NULL) { + fop->str[1] = gf_strdup(basename); + if (fop->str[1] == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to duplicate a string."); + + goto out; + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL); + } +} + +/* FOP: inodelk */ + +int32_t +ec_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + 
ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_INODELK, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + ec_combine(cbk, NULL); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_inodelk(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_inodelk_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->inodelk, + fop->str[0], &fop->loc[0], fop->int32, &fop->flock, + fop->xdata); +} + +int32_t +ec_manager_inodelk(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk; + + switch (state) { + case EC_STATE_INIT: + fop->flock.l_len += ec_adjust_offset_down( + fop->xl->private, &fop->flock.l_start, _gf_true); + ec_adjust_offset_up(fop->xl->private, &fop->flock.l_len, _gf_true); + if ((fop->int32 == F_SETLKW) && (fop->flock.l_type != F_UNLCK)) { + fop->uint32 = EC_LOCK_MODE_ALL; + fop->int32 = F_SETLK; + } + + /* Fall through */ + + case EC_STATE_DISPATCH: + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + case -EC_STATE_PREPARE_ANSWER: + if (fop->flock.l_type != F_UNLCK) { + uintptr_t mask; + + ec_fop_set_error(fop, ec_lock_check(fop, &mask)); + if (fop->error != 0) { + if (mask != 0) { + ec_t *ec = fop->xl->private; + struct gf_flock flock; + + flock.l_type = F_UNLCK; + flock.l_whence = 
fop->flock.l_whence; + flock.l_start = fop->flock.l_start * ec->fragments; + flock.l_len = fop->flock.l_len * ec->fragments; + flock.l_pid = 0; + flock.l_owner.len = 0; + + if (fop->id == GF_FOP_INODELK) { + ec_inodelk(fop->frame, fop->xl, + &fop->frame->root->lk_owner, mask, 1, + ec_lock_unlocked, NULL, fop->str[0], + &fop->loc[0], F_SETLK, &flock, + fop->xdata); + } else { + ec_finodelk(fop->frame, fop->xl, + &fop->frame->root->lk_owner, mask, 1, + ec_lock_unlocked, NULL, fop->str[0], + fop->fd, F_SETLK, &flock, fop->xdata); + } + } + if (fop->error < 0) { + fop->error = 0; + + fop->int32 = F_SETLKW; + + ec_dispatch_inc(fop); + + return EC_STATE_PREPARE_ANSWER; + } + } + } else { + ec_fop_prepare_answer(fop, _gf_true); + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->id == GF_FOP_INODELK) { + if (fop->cbks.inodelk != NULL) { + fop->cbks.inodelk(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, cbk->xdata); + } + } else { + if (fop->cbks.finodelk != NULL) { + fop->cbks.finodelk(fop->req_frame, fop, fop->xl, + cbk->op_ret, cbk->op_errno, cbk->xdata); + } + } + + return EC_STATE_END; + + case -EC_STATE_INIT: + case -EC_STATE_DISPATCH: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->id == GF_FOP_INODELK) { + if (fop->cbks.inodelk != NULL) { + fop->cbks.inodelk(fop->req_frame, fop, fop->xl, -1, + fop->error, NULL); + } + } else { + if (fop->cbks.finodelk != NULL) { + fop->cbks.finodelk(fop->req_frame, fop, fop->xl, -1, + fop->error, NULL); + } + } + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_inodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner, + uintptr_t target, uint32_t fop_flags, fop_inodelk_cbk_t func, + void *data, const char *volume, loc_t *loc, int32_t cmd, + struct gf_flock *flock, 
dict_t *xdata) +{ + ec_cbk_t callback = {.inodelk = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(INODELK) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_INODELK, 0, target, + fop_flags, ec_wind_inodelk, ec_manager_inodelk, + callback, data); + if (fop == NULL) { + goto out; + } + + fop->int32 = cmd; + ec_owner_copy(fop->frame, owner); + + if (volume != NULL) { + fop->str[0] = gf_strdup(volume); + if (fop->str[0] == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to duplicate a string."); + + goto out; + } + } + if (loc != NULL) { + if (loc_copy(&fop->loc[0], loc) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL, + "Failed to copy a location."); + + goto out; + } + } + if (flock != NULL) { + fop->flock.l_type = flock->l_type; + fop->flock.l_whence = flock->l_whence; + fop->flock.l_start = flock->l_start; + fop->flock.l_len = flock->l_len; + fop->flock.l_pid = flock->l_pid; + fop->flock.l_owner.len = flock->l_owner.len; + if (flock->l_owner.len > 0) { + memcpy(fop->flock.l_owner.data, flock->l_owner.data, + flock->l_owner.len); + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL); + } +} + +/* FOP: finodelk */ + +int32_t +ec_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + 
GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FINODELK, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (xdata != NULL) { + cbk->xdata = dict_ref(xdata); + if (cbk->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + ec_combine(cbk, NULL); + } + +out: + if (fop != NULL) { + ec_complete(fop); + } + + return 0; +} + +void +ec_wind_finodelk(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_finodelk_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->finodelk, + fop->str[0], fop->fd, fop->int32, &fop->flock, + fop->xdata); +} + +void +ec_finodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner, + uintptr_t target, uint32_t fop_flags, fop_finodelk_cbk_t func, + void *data, const char *volume, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) +{ + ec_cbk_t callback = {.finodelk = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(FINODELK) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_FINODELK, 0, target, + fop_flags, ec_wind_finodelk, ec_manager_inodelk, + callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + fop->int32 = cmd; + ec_owner_copy(fop->frame, owner); + + if (volume != NULL) { + fop->str[0] = gf_strdup(volume); + if (fop->str[0] == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to duplicate a string."); + + goto out; + } + } + if (fd != NULL) { + fop->fd = 
fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (flock != NULL) { + fop->flock.l_type = flock->l_type; + fop->flock.l_whence = flock->l_whence; + fop->flock.l_start = flock->l_start; + fop->flock.l_len = flock->l_len; + fop->flock.l_pid = flock->l_pid; + fop->flock.l_owner.len = flock->l_owner.len; + if (flock->l_owner.len > 0) { + memcpy(fop->flock.l_owner.data, flock->l_owner.data, + flock->l_owner.len); + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL); + } +} + +/* FOP: lk */ + +int32_t +ec_combine_lk(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src) +{ + if (!ec_flock_compare(&dst->flock, &src->flock)) { + gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_LOCK_MISMATCH, + "Mismatching lock in " + "answers of 'GF_FOP_LK'"); + + return 0; + } + + return 1; +} + +int32_t +ec_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_flock *flock, dict_t *xdata) +{ + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = frame->local; + + ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame, + op_ret, op_errno); + + cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_LK, idx, op_ret, + op_errno); + if (cbk != NULL) { + if (op_ret >= 0) { + if (flock != NULL) { + cbk->flock.l_type = flock->l_type; + cbk->flock.l_whence = flock->l_whence; + 
/* Winds the LK request stored in 'fop' to the subvolume found at position
 * 'idx' of ec->xl_list.
 *
 * The request is built from the values saved in the fop (fd, command in
 * fop->int32, flock and xdata). 'idx' is passed as the cookie; ec_lk_cbk()
 * converts the cookie back to the subvolume index. */
void
ec_wind_lk(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
{
    ec_trace("WIND", fop, "idx=%d", idx);

    STACK_WIND_COOKIE(fop->frame, ec_lk_cbk, (void *)(uintptr_t)idx,
                      ec->xl_list[idx], ec->xl_list[idx]->fops->lk, fop->fd,
                      fop->int32, &fop->flock, fop->xdata);
}
ec_fop_prepare_answer(fop, _gf_true); + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.lk != NULL) { + fop->cbks.lk(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, &cbk->flock, cbk->xdata); + } + + return EC_STATE_END; + + case -EC_STATE_INIT: + case -EC_STATE_DISPATCH: + case -EC_STATE_REPORT: + GF_ASSERT(fop->error != 0); + + if (fop->cbks.lk != NULL) { + fop->cbks.lk(fop->req_frame, fop, fop->xl, -1, fop->error, NULL, + NULL); + } + + return EC_STATE_END; + + default: + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void +ec_lk(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags, + fop_lk_cbk_t func, void *data, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) +{ + ec_cbk_t callback = {.lk = func}; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; + + gf_msg_trace("ec", 0, "EC(LK) %p", frame); + + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_LK, 0, target, fop_flags, + ec_wind_lk, ec_manager_lk, callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + + fop->int32 = cmd; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + + goto out; + } + } + if (flock != NULL) { + fop->flock.l_type = flock->l_type; + fop->flock.l_whence = flock->l_whence; + fop->flock.l_start = flock->l_start; + fop->flock.l_len = flock->l_len; + fop->flock.l_pid = flock->l_pid; + fop->flock.l_owner.len = flock->l_owner.len; + if (flock->l_owner.len > 0) { + memcpy(fop->flock.l_owner.data, flock->l_owner.data, + flock->l_owner.len); + } + } + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); 
+ if (fop->xdata == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL); + } +} diff --git a/xlators/cluster/ec/src/ec-mem-types.h b/xlators/cluster/ec/src/ec-mem-types.h new file mode 100644 index 00000000000..3252c4c1c58 --- /dev/null +++ b/xlators/cluster/ec/src/ec-mem-types.h @@ -0,0 +1,30 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_MEM_TYPES_H__ +#define __EC_MEM_TYPES_H__ + +#include <glusterfs/mem-types.h> + +enum gf_ec_mem_types_ { + ec_mt_ec_t = gf_common_mt_end + 1, + ec_mt_xlator_t, + ec_mt_ec_inode_t, + ec_mt_ec_fd_t, + ec_mt_subvol_healer_t, + ec_mt_ec_gf_t, + ec_mt_ec_code_t, + ec_mt_ec_code_builder_t, + ec_mt_ec_matrix_t, + ec_mt_ec_stripe_t, + ec_mt_end +}; + +#endif /* __EC_MEM_TYPES_H__ */ diff --git a/xlators/cluster/ec/src/ec-messages.h b/xlators/cluster/ec/src/ec-messages.h new file mode 100644 index 00000000000..72e98f11286 --- /dev/null +++ b/xlators/cluster/ec/src/ec-messages.h @@ -0,0 +1,61 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef _EC_MESSAGES_H_ +#define _EC_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(EC, EC_MSG_INVALID_CONFIG, EC_MSG_HEAL_FAIL, + EC_MSG_DICT_COMBINE_FAIL, EC_MSG_STIME_COMBINE_FAIL, + EC_MSG_INVALID_DICT_NUMS, EC_MSG_IATT_COMBINE_FAIL, + EC_MSG_INVALID_FORMAT, EC_MSG_DICT_GET_FAILED, + EC_MSG_UNHANDLED_STATE, EC_MSG_FILE_DESC_REF_FAIL, + EC_MSG_LOC_COPY_FAIL, EC_MSG_BUF_REF_FAIL, EC_MSG_DICT_REF_FAIL, + EC_MSG_LK_UNLOCK_FAILED, EC_MSG_UNLOCK_FAILED, + EC_MSG_LOC_PARENT_INODE_MISSING, EC_MSG_INVALID_LOC_NAME, + EC_MSG_NO_MEMORY, EC_MSG_GFID_MISMATCH, EC_MSG_UNSUPPORTED_VERSION, + EC_MSG_FD_CREATE_FAIL, EC_MSG_READDIRP_REQ_PREP_FAIL, + EC_MSG_LOOKUP_REQ_PREP_FAIL, EC_MSG_INODE_REF_FAIL, + EC_MSG_LOOKUP_READAHEAD_FAIL, EC_MSG_FRAME_MISMATCH, + EC_MSG_XLATOR_MISMATCH, EC_MSG_VECTOR_MISMATCH, EC_MSG_IATT_MISMATCH, + EC_MSG_FD_MISMATCH, EC_MSG_DICT_MISMATCH, EC_MSG_INDEX_DIR_GET_FAIL, + EC_MSG_PREOP_LOCK_FAILED, EC_MSG_CHILDS_INSUFFICIENT, + EC_MSG_OP_EXEC_UNAVAIL, EC_MSG_UNLOCK_DELAY_FAILED, + EC_MSG_SIZE_VERS_UPDATE_FAIL, EC_MSG_INVALID_REQUEST, + EC_MSG_INVALID_LOCK_TYPE, EC_MSG_SIZE_VERS_GET_FAIL, + EC_MSG_FILE_SIZE_GET_FAIL, EC_MSG_FOP_MISMATCH, + EC_MSG_SUBVOL_ID_DICT_SET_FAIL, EC_MSG_SUBVOL_BUILD_FAIL, + EC_MSG_XLATOR_INIT_FAIL, EC_MSG_NO_PARENTS, EC_MSG_TIMER_CREATE_FAIL, + EC_MSG_TOO_MANY_SUBVOLS, EC_MSG_DATA_UNAVAILABLE, + EC_MSG_INODE_REMOVE_FAIL, EC_MSG_INVALID_REDUNDANCY, + EC_MSG_XLATOR_PARSE_OPT_FAIL, EC_MSG_OP_FAIL_ON_SUBVOLS, + EC_MSG_INVALID_INODE, EC_MSG_LOCK_MISMATCH, EC_MSG_XDATA_MISMATCH, + EC_MSG_HEALING_INFO, EC_MSG_HEAL_SUCCESS, EC_MSG_FULL_SWEEP_START, + 
EC_MSG_FULL_SWEEP_STOP, EC_MSG_INVALID_FOP, EC_MSG_EC_UP, + EC_MSG_EC_DOWN, EC_MSG_SIZE_XATTR_GET_FAIL, + EC_MSG_VER_XATTR_GET_FAIL, EC_MSG_CONFIG_XATTR_GET_FAIL, + EC_MSG_CONFIG_XATTR_INVALID, EC_MSG_EXTENSION, EC_MSG_EXTENSION_NONE, + EC_MSG_EXTENSION_UNKNOWN, EC_MSG_EXTENSION_UNSUPPORTED, + EC_MSG_EXTENSION_FAILED, EC_MSG_NO_GF, EC_MSG_MATRIX_FAILED, + EC_MSG_DYN_CREATE_FAILED, EC_MSG_DYN_CODEGEN_FAILED, + EC_MSG_THREAD_CLEANUP_FAILED, EC_MSG_FD_BAD); + +#endif /* !_EC_MESSAGES_H_ */ diff --git a/xlators/cluster/ec/src/ec-method.c b/xlators/cluster/ec/src/ec-method.c new file mode 100644 index 00000000000..55faed0b193 --- /dev/null +++ b/xlators/cluster/ec/src/ec-method.c @@ -0,0 +1,433 @@ +/* + Copyright (c) 2012-2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <string.h> +#include <inttypes.h> + +#include "ec-types.h" +#include "ec-mem-types.h" +#include "ec-galois.h" +#include "ec-code.h" +#include "ec-method.h" +#include "ec-helpers.h" + +static void +ec_method_matrix_normal(ec_gf_t *gf, uint32_t *matrix, uint32_t columns, + uint32_t *values, uint32_t count) +{ + uint32_t i, j, v, tmp; + + columns--; + for (i = 0; i < count; i++) { + v = *values++; + *matrix++ = tmp = ec_gf_exp(gf, v, columns); + for (j = 0; j < columns; j++) { + *matrix++ = tmp = ec_gf_div(gf, tmp, v); + } + } +} + +static void +ec_method_matrix_inverse(ec_gf_t *gf, uint32_t *matrix, uint32_t *values, + uint32_t count) +{ + uint32_t a[count]; + uint32_t i, j, p, last, tmp; + + last = count - 1; + for (i = 0; i < last; i++) { + a[i] = 1; + } + a[i] = values[0]; + for (i = last; i > 0; i--) { + for (j = i - 1; j < last; j++) { + a[j] = a[j + 1] ^ ec_gf_mul(gf, values[i], a[j]); + } + a[j] = ec_gf_mul(gf, values[i], a[j]); + } + for (i = 0; i < count; i++) { + p = a[0]; + matrix += count; + *matrix = tmp = p ^ values[i]; + for (j = 1; j < last; j++) { + matrix += count; + *matrix = tmp = a[j] ^ ec_gf_mul(gf, values[i], tmp); + p = tmp ^ ec_gf_mul(gf, values[i], p); + } + for (j = 0; j < last; j++) { + *matrix = ec_gf_div(gf, *matrix, p); + matrix -= count; + } + *matrix = ec_gf_div(gf, 1, p); + matrix++; + } +} + +static void +ec_method_matrix_init(ec_matrix_list_t *list, ec_matrix_t *matrix, + uintptr_t mask, uint32_t *rows, gf_boolean_t inverse) +{ + uint32_t i; + + matrix->refs = 1; + matrix->mask = mask; + matrix->code = list->code; + matrix->columns = list->columns; + INIT_LIST_HEAD(&matrix->lru); + + if (inverse) { + matrix->rows = list->columns; + ec_method_matrix_inverse(matrix->code->gf, matrix->values, rows, + matrix->rows); + for (i = 0; i < matrix->rows; i++) { + matrix->row_data[i].values = matrix->values + i * matrix->columns; + matrix->row_data[i].func.interleaved = ec_code_build_interleaved( + matrix->code, 
EC_METHOD_WORD_SIZE, matrix->row_data[i].values, + matrix->columns); + } + } else { + matrix->rows = list->rows; + ec_method_matrix_normal(matrix->code->gf, matrix->values, + matrix->columns, rows, matrix->rows); + for (i = 0; i < matrix->rows; i++) { + matrix->row_data[i].values = matrix->values + i * matrix->columns; + matrix->row_data[i].func.linear = ec_code_build_linear( + matrix->code, EC_METHOD_WORD_SIZE, matrix->row_data[i].values, + matrix->columns); + } + } +} + +static void +ec_method_matrix_release(ec_matrix_t *matrix) +{ + uint32_t i; + + for (i = 0; i < matrix->rows; i++) { + if (matrix->row_data[i].func.linear != NULL) { + ec_code_release(matrix->code, &matrix->row_data[i].func); + matrix->row_data[i].func.linear = NULL; + } + } +} + +static void +ec_method_matrix_destroy(ec_matrix_list_t *list, ec_matrix_t *matrix) +{ + list_del_init(&matrix->lru); + + ec_method_matrix_release(matrix); + + mem_put(matrix); + + list->count--; +} + +static void +ec_method_matrix_unref(ec_matrix_list_t *list, ec_matrix_t *matrix) +{ + if (--matrix->refs == 0) { + list_add_tail(&matrix->lru, &list->lru); + if (list->count > list->max) { + matrix = list_first_entry(&list->lru, ec_matrix_t, lru); + ec_method_matrix_destroy(list, matrix); + } + } +} + +static ec_matrix_t * +ec_method_matrix_lookup(ec_matrix_list_t *list, uintptr_t mask, uint32_t *pos) +{ + ec_matrix_t *matrix; + uint32_t i, j, k; + + i = 0; + j = list->count; + while (i < j) { + k = (i + j) >> 1; + matrix = list->objects[k]; + if (matrix->mask == mask) { + *pos = k; + return matrix; + } + if (matrix->mask < mask) { + i = k + 1; + } else { + j = k; + } + } + *pos = i; + + return NULL; +} + +static void +ec_method_matrix_remove(ec_matrix_list_t *list, uintptr_t mask) +{ + uint32_t pos; + + if (ec_method_matrix_lookup(list, mask, &pos) != NULL) { + list->count--; + if (pos < list->count) { + memmove(list->objects + pos, list->objects + pos + 1, + sizeof(ec_matrix_t *) * (list->count - pos)); + } + } +} + 
/* Returns a referenced matrix for the fragment set identified by 'mask',
 * built in its inverse form from the row values in 'rows'.
 *
 * The whole operation runs with list->lock held. Three cases are handled:
 *   - a matrix with this mask is already registered in list->objects: it is
 *     unlinked from the LRU list and gets one more reference;
 *   - the lookup array is full and the LRU list is not empty: the first LRU
 *     entry (ec_method_matrix_unref() appends at the tail) is recycled;
 *   - otherwise a new matrix is taken from list->pool.
 *
 * If the pool allocation fails, EC_ERR(ENOMEM) is returned instead of a
 * matrix; ec_method_decode() checks the result with EC_IS_ERR(). The matrix
 * is given back with ec_method_matrix_put(). */
static ec_matrix_t *
ec_method_matrix_get(ec_matrix_list_t *list, uintptr_t mask, uint32_t *rows)
{
    ec_matrix_t *matrix;
    uint32_t pos;

    LOCK(&list->lock);

    matrix = ec_method_matrix_lookup(list, mask, &pos);
    if (matrix != NULL) {
        /* Already built: just take it out of the LRU and reference it. */
        list_del_init(&matrix->lru);
        matrix->refs++;

        goto out;
    }

    if ((list->count >= list->max) && !list_empty(&list->lru)) {
        matrix = list_first_entry(&list->lru, ec_matrix_t, lru);
        list_del_init(&matrix->lru);

        /* Unregister the old contents from the lookup array (this is what
         * decrements list->count when the mask is found)... */
        ec_method_matrix_remove(list, matrix->mask);

        /* ...and release the functions that were built for them. */
        ec_method_matrix_release(matrix);
    } else {
        matrix = mem_get0(list->pool);
        if (matrix == NULL) {
            matrix = EC_ERR(ENOMEM);
            goto out;
        }
        /* The coefficients are stored in the same pool object, right after
         * the header and the row_data[] array of list->columns entries. */
        matrix->values = (uint32_t *)((uintptr_t)matrix + sizeof(ec_matrix_t) +
                                      sizeof(ec_matrix_row_t) * list->columns);
    }

    /* _gf_true requests the inverse form; init also sets refs to 1. */
    ec_method_matrix_init(list, matrix, mask, rows, _gf_true);

    if (list->count < list->max) {
        ec_method_matrix_insert(list, matrix);
    } else {
        /* No room in the lookup array: the matrix is returned but not
         * registered, so later lookups will not find it.
         * NOTE(review): the mask is presumably cleared so that recycling this
         * matrix later cannot unregister another matrix - confirm. */
        matrix->mask = 0;
    }

out:
    UNLOCK(&list->lock);

    return matrix;
}
matrix->values = (uint32_t *)((uintptr_t)matrix + sizeof(ec_matrix_t) + + sizeof(ec_matrix_row_t) * list->rows); + + list->code = ec_code_create(list->gf, ec_code_detect(xl, gen)); + if (EC_IS_ERR(list->code)) { + err = EC_GET_ERR(list->code); + list->code = NULL; + goto failed_matrix; + } + + for (i = 0; i < list->rows; i++) { + values[i] = i + 1; + } + ec_method_matrix_init(list, matrix, 0, values, _gf_false); + + list->encode = matrix; + + return 0; + +failed_matrix: + GF_FREE(matrix); +failed: + return err; +} + +int32_t +ec_method_init(xlator_t *xl, ec_matrix_list_t *list, uint32_t columns, + uint32_t rows, uint32_t max, const char *gen) +{ + list->columns = columns; + list->rows = rows; + list->max = max; + list->stripe = EC_METHOD_CHUNK_SIZE * list->columns; + INIT_LIST_HEAD(&list->lru); + int32_t err; + + list->pool = mem_pool_new_fn(xl->ctx, + sizeof(ec_matrix_t) + + sizeof(ec_matrix_row_t) * columns + + sizeof(uint32_t) * columns * columns, + 128, "ec_matrix_t"); + if (list->pool == NULL) { + err = -ENOMEM; + goto failed; + } + + list->objects = GF_MALLOC(sizeof(ec_matrix_t *) * max, ec_mt_ec_matrix_t); + if (list->objects == NULL) { + err = -ENOMEM; + goto failed_pool; + } + + list->gf = ec_gf_prepare(EC_GF_BITS, EC_GF_MOD); + if (EC_IS_ERR(list->gf)) { + err = EC_GET_ERR(list->gf); + goto failed_objects; + } + + err = ec_method_setup(xl, list, gen); + if (err != 0) { + goto failed_gf; + } + + LOCK_INIT(&list->lock); + + return 0; + +failed_gf: + ec_gf_destroy(list->gf); +failed_objects: + GF_FREE(list->objects); +failed_pool: + mem_pool_destroy(list->pool); +failed: + list->pool = NULL; + list->objects = NULL; + list->gf = NULL; + + return err; +} + +void +ec_method_fini(ec_matrix_list_t *list) +{ + ec_matrix_t *matrix; + + if (list->encode == NULL) { + return; + } + + while (!list_empty(&list->lru)) { + matrix = list_first_entry(&list->lru, ec_matrix_t, lru); + ec_method_matrix_destroy(list, matrix); + } + + GF_ASSERT(list->count == 0); + + if 
(list->pool) /*Init was successful*/ + LOCK_DESTROY(&list->lock); + + ec_method_matrix_release(list->encode); + GF_FREE(list->encode); + + ec_code_destroy(list->code); + ec_gf_destroy(list->gf); + GF_FREE(list->objects); + + if (list->pool) + mem_pool_destroy(list->pool); +} + +int32_t +ec_method_update(xlator_t *xl, ec_matrix_list_t *list, const char *gen) +{ + /* TODO: Allow changing code generator */ + + return 0; +} + +void +ec_method_encode(ec_matrix_list_t *list, uint64_t size, void *in, void **out) +{ + ec_matrix_t *matrix; + uint64_t pos; + uint32_t i; + + matrix = list->encode; + for (pos = 0; pos < size; pos += list->stripe) { + for (i = 0; i < matrix->rows; i++) { + matrix->row_data[i].func.linear( + out[i], in, pos, matrix->row_data[i].values, list->columns); + out[i] += EC_METHOD_CHUNK_SIZE; + } + } +} + +int32_t +ec_method_decode(ec_matrix_list_t *list, uint64_t size, uintptr_t mask, + uint32_t *rows, void **in, void *out) +{ + ec_matrix_t *matrix; + uint64_t pos; + uint32_t i; + + matrix = ec_method_matrix_get(list, mask, rows); + if (EC_IS_ERR(matrix)) { + return EC_GET_ERR(matrix); + } + for (pos = 0; pos < size; pos += EC_METHOD_CHUNK_SIZE) { + for (i = 0; i < matrix->rows; i++) { + matrix->row_data[i].func.interleaved( + out, in, pos, matrix->row_data[i].values, list->columns); + out += EC_METHOD_CHUNK_SIZE; + } + } + + ec_method_matrix_put(list, matrix); + + return 0; +} diff --git a/xlators/cluster/ec/src/ec-method.h b/xlators/cluster/ec/src/ec-method.h new file mode 100644 index 00000000000..f91233b2f88 --- /dev/null +++ b/xlators/cluster/ec/src/ec-method.h @@ -0,0 +1,48 @@ +/* + Copyright (c) 2012-2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
/* Parameters passed to ec_gf_prepare() by ec_method_init(). Presumably the
 * number of bits of each field element and the modulus of the field (0x11D
 * has bits 8, 4, 3, 2 and 0 set). */
#define EC_GF_BITS 8
#define EC_GF_MOD 0x11D

/* 2^EC_GF_BITS, i.e. 256 with the current value of EC_GF_BITS. */
#define EC_GF_SIZE (1 << EC_GF_BITS)

/* Determines the maximum size of the matrix used to encode/decode data */
#define EC_METHOD_MAX_FRAGMENTS 16
/* Determines the maximum number of usable elements in the Galois Field */
#define EC_METHOD_MAX_NODES (EC_GF_SIZE - 1)

/* Passed by ec_method_matrix_init() to ec_code_build_linear() and
 * ec_code_build_interleaved() as the word size of the generated functions.
 * NOTE(review): looks like a size in bytes - confirm. */
#define EC_METHOD_WORD_SIZE 64

/* Number of bytes by which output pointers are advanced after each matrix row
 * is processed in ec_method_encode()/ec_method_decode() (512 with the current
 * values). The stripe size of a matrix list is this value multiplied by the
 * number of columns. */
#define EC_METHOD_CHUNK_SIZE (EC_METHOD_WORD_SIZE * EC_GF_BITS)
+*/ + +#ifndef __EC_TYPES_H__ +#define __EC_TYPES_H__ + +#include <glusterfs/xlator.h> +#include <glusterfs/timer.h> +#include "libxlator.h" +#include <glusterfs/atomic.h> + +#define EC_GF_MAX_REGS 16 + +enum _ec_heal_need; +typedef enum _ec_heal_need ec_heal_need_t; + +enum _ec_stripe_part; +typedef enum _ec_stripe_part ec_stripe_part_t; + +enum _ec_read_policy; +typedef enum _ec_read_policy ec_read_policy_t; + +struct _ec_config; +typedef struct _ec_config ec_config_t; + +struct _ec_fd; +typedef struct _ec_fd ec_fd_t; + +struct _ec_fragment_range; +typedef struct _ec_fragment_range ec_fragment_range_t; + +struct _ec_inode; +typedef struct _ec_inode ec_inode_t; + +union _ec_cbk; +typedef union _ec_cbk ec_cbk_t; + +struct _ec_lock; +typedef struct _ec_lock ec_lock_t; + +struct _ec_lock_link; +typedef struct _ec_lock_link ec_lock_link_t; + +struct _ec_fop_data; +typedef struct _ec_fop_data ec_fop_data_t; + +struct _ec_cbk_data; +typedef struct _ec_cbk_data ec_cbk_data_t; + +enum _ec_gf_opcode; +typedef enum _ec_gf_opcode ec_gf_opcode_t; + +struct _ec_gf_op; +typedef struct _ec_gf_op ec_gf_op_t; + +struct _ec_gf_mul; +typedef struct _ec_gf_mul ec_gf_mul_t; + +struct _ec_gf; +typedef struct _ec_gf ec_gf_t; + +struct _ec_code_gen; +typedef struct _ec_code_gen ec_code_gen_t; + +struct _ec_code; +typedef struct _ec_code ec_code_t; + +struct _ec_code_arg; +typedef struct _ec_code_arg ec_code_arg_t; + +struct _ec_code_op; +typedef struct _ec_code_op ec_code_op_t; + +struct _ec_code_builder; +typedef struct _ec_code_builder ec_code_builder_t; + +struct _ec_code_chunk; +typedef struct _ec_code_chunk ec_code_chunk_t; + +struct _ec_stripe; +typedef struct _ec_stripe ec_stripe_t; + +struct _ec_stripe_list; +typedef struct _ec_stripe_list ec_stripe_list_t; + +struct _ec_code_space; +typedef struct _ec_code_space ec_code_space_t; + +typedef void (*ec_code_func_linear_t)(void *dst, void *src, uint64_t offset, + uint32_t *values, uint32_t count); + +typedef void 
/* Enumeration to indicate FD status. */
typedef enum { EC_FD_NOT_OPENED, EC_FD_OPENED, EC_FD_OPENING } ec_fd_status_t;
fop_readdirp_cbk_t readdirp; + fop_readlink_cbk_t readlink; + fop_readv_cbk_t readv; + fop_removexattr_cbk_t removexattr; + fop_fremovexattr_cbk_t fremovexattr; + fop_rename_cbk_t rename; + fop_rmdir_cbk_t rmdir; + fop_setattr_cbk_t setattr; + fop_fsetattr_cbk_t fsetattr; + fop_setxattr_cbk_t setxattr; + fop_fsetxattr_cbk_t fsetxattr; + fop_stat_cbk_t stat; + fop_fstat_cbk_t fstat; + fop_statfs_cbk_t statfs; + fop_symlink_cbk_t symlink; + fop_truncate_cbk_t truncate; + fop_ftruncate_cbk_t ftruncate; + fop_unlink_cbk_t unlink; + fop_writev_cbk_t writev; + fop_xattrop_cbk_t xattrop; + fop_fxattrop_cbk_t fxattrop; + fop_zerofill_cbk_t zerofill; + fop_seek_cbk_t seek; + fop_ipc_cbk_t ipc; +}; + +struct _ec_lock { + ec_inode_t *ctx; + gf_timer_t *timer; + + /* List of owners of this lock. All fops added to this list are running + * concurrently. */ + struct list_head owners; + + /* List of fops waiting to be an owner of the lock. Fops are added to this + * list when the current owner has an incompatible access (conflicting lock) + * or the lock is not acquired yet. */ + struct list_head waiting; + + /* List of fops that will wait until the next unlock/lock cycle. This + * happens when the currently acquired lock is decided to be released as + * soon as possible. In this case, all frozen fops will be continued only + * after the lock is reacquired. 
*/ + struct list_head frozen; + + uintptr_t mask; + uintptr_t good_mask; + uintptr_t healing; + uint32_t refs_owners; /* Refs for fops owning the lock */ + uint32_t refs_pending; /* Refs assigned to fops being prepared */ + uint32_t waiting_flags; /*Track xattrop/dirty marking*/ + gf_boolean_t acquired; + gf_boolean_t contention; + gf_boolean_t unlock_now; + gf_boolean_t release; + gf_boolean_t query; + fd_t *fd; + loc_t loc; + union { + entrylk_type type; + struct gf_flock flock; + }; +}; + +struct _ec_lock_link { + ec_lock_t *lock; + ec_fop_data_t *fop; + struct list_head owner_list; + struct list_head wait_list; + gf_boolean_t update[2]; + gf_boolean_t dirty[2]; + gf_boolean_t optimistic_changelog; + loc_t *base; + uint64_t size; + uint32_t waiting_flags; + off_t fl_start; + off_t fl_end; +}; + +/* This structure keeps a range of fragment offsets affected by a fop. Since + * real file offsets can be difficult to handle correctly because of overflows, + * we use the 'scaled' offset, which corresponds to the offset of the fragment + * seen by the bricks, which is always smaller and cannot overflow. 
*/ +struct _ec_fragment_range { + uint64_t first; /* Address of the first affected fragment as seen by the + bricks (offset on brick) */ + uint64_t last; /* Address of the first non affected fragment as seen by + the bricks (offset on brick) */ +}; + +/* EC xlator data structure to collect all the data required to perform + * the file operation.*/ +struct _ec_fop_data { + int32_t id; /* ID of the file operation */ + int32_t refs; + int32_t state; + uint32_t minimum; /* Minimum number of successful + operation required to conclude a + fop as successful */ + int32_t expected; + int32_t winds; + int32_t jobs; + int32_t error; + ec_fop_data_t *parent; + xlator_t *xl; /* points to EC xlator */ + call_frame_t *req_frame; /* frame of the calling xlator */ + call_frame_t *frame; /* frame used by this fop */ + struct list_head cbk_list; /* sorted list of groups of answers */ + struct list_head answer_list; /* list of answers */ + struct list_head pending_list; /* member of ec_t.pending_fops */ + ec_cbk_data_t *answer; /* accepted answer */ + int32_t lock_count; + int32_t locked; + gf_lock_t lock; + ec_lock_link_t locks[2]; + int32_t first_lock; + + uint32_t fop_flags; /* Flags passed by the caller. */ + uint32_t flags; /* Internal flags. 
*/ + uint32_t first; + uintptr_t mask; + uintptr_t healing; /*Dispatch is done but call is successful only + if fop->minimum number of subvolumes succeed + which are not healing*/ + uintptr_t remaining; + uintptr_t received; /* Mask of responses */ + uintptr_t good; + + uid_t uid; + gid_t gid; + + ec_wind_f wind; /* Function to wind to */ + ec_handler_f handler; /* FOP manager function */ + ec_resume_f resume; + ec_cbk_t cbks; /* Callback function for this FOP */ + void *data; + ec_heal_t *heal; + struct list_head healer; + + uint64_t user_size; + uint32_t head; + + int32_t use_fd; /* Indicates whether this FOP uses FD or + not */ + + dict_t *xdata; + dict_t *dict; + int32_t int32; + uint32_t uint32; + uint64_t size; + off_t offset; + mode_t mode[2]; + entrylk_cmd entrylk_cmd; + entrylk_type entrylk_type; + gf_xattrop_flags_t xattrop_flags; + dev_t dev; + inode_t *inode; + fd_t *fd; /* FD of the file on which FOP is + being carried upon */ + struct iatt iatt; + char *str[2]; + loc_t loc[2]; /* Holds the location details for + the file */ + struct gf_flock flock; + struct iovec *vector; + struct iobref *buffers; + gf_seek_what_t seek; + ec_fragment_range_t frag_range; /* This will hold the range of stripes + affected by the fop. */ + char *errstr; /*String of fop name, path and gfid + to be used in gf_msg. 
*/ +}; + +struct _ec_cbk_data { + struct list_head list; /* item in the sorted list of groups */ + struct list_head answer_list; /* item in the list of answers */ + ec_fop_data_t *fop; + ec_cbk_data_t *next; /* next answer in the same group */ + uint32_t idx; + int32_t op_ret; + int32_t op_errno; + int32_t count; + uintptr_t mask; + + dict_t *xdata; + dict_t *dict; + int32_t int32; + uintptr_t uintptr[3]; + uint64_t size; + uint64_t version[2]; + inode_t *inode; + fd_t *fd; + struct statvfs statvfs; + struct iatt iatt[5]; + struct gf_flock flock; + struct iovec *vector; + struct iobref *buffers; + char *str; + gf_dirent_t entries; + off_t offset; + gf_seek_what_t what; +}; + +enum _ec_gf_opcode { + EC_GF_OP_LOAD, + EC_GF_OP_STORE, + EC_GF_OP_COPY, + EC_GF_OP_XOR2, + EC_GF_OP_XOR3, + EC_GF_OP_XORM, + EC_GF_OP_END +}; + +struct _ec_gf_op { + ec_gf_opcode_t op; + uint32_t arg1; + uint32_t arg2; + uint32_t arg3; +}; + +struct _ec_gf_mul { + uint32_t regs; + uint32_t map[EC_GF_MAX_REGS]; + ec_gf_op_t *ops; +}; + +struct _ec_gf { + uint32_t bits; + uint32_t size; + uint32_t mod; + uint32_t min_ops; + uint32_t max_ops; + uint32_t avg_ops; + uint32_t *log; + uint32_t *pow; + ec_gf_mul_t **table; +}; + +struct _ec_code_gen { + char *name; + char **flags; + uint32_t width; + + void (*prolog)(ec_code_builder_t *builder); + void (*epilog)(ec_code_builder_t *builder); + void (*load)(ec_code_builder_t *builder, uint32_t reg, uint32_t offset, + uint32_t bit); + void (*store)(ec_code_builder_t *builder, uint32_t reg, uint32_t bit); + void (*copy)(ec_code_builder_t *builder, uint32_t dst, uint32_t src); + void (*xor2)(ec_code_builder_t *builder, uint32_t dst, uint32_t src); + void (*xor3)(ec_code_builder_t *builder, uint32_t dst, uint32_t src1, + uint32_t src2); + void (*xorm)(ec_code_builder_t *builder, uint32_t dst, uint32_t offset, + uint32_t bit); +}; + +struct _ec_code { + gf_lock_t lock; + struct list_head spaces; + ec_gf_t *gf; + ec_code_gen_t *gen; +}; + +struct 
_ec_code_arg { + uint32_t value; +}; + +struct _ec_code_op { + ec_gf_opcode_t op; + ec_code_arg_t arg1; + ec_code_arg_t arg2; + ec_code_arg_t arg3; +}; + +struct _ec_code_builder { + ec_code_t *code; + uint64_t address; + uint8_t *data; + uint32_t size; + int32_t error; + uint32_t regs; + uint32_t bits; + uint32_t width; + uint32_t count; + uint32_t base; + uint32_t map[EC_GF_MAX_REGS]; + gf_boolean_t linear; + uint64_t loop; + ec_code_op_t ops[0]; +}; + +struct _ec_code_chunk { + struct list_head list; + size_t size; + ec_code_space_t *space; +}; + +struct _ec_code_space { + struct list_head list; + struct list_head chunks; + ec_code_t *code; + void *exec; + size_t size; +}; + +union _ec_code_func { + ec_code_func_linear_t linear; + ec_code_func_interleaved_t interleaved; +}; + +struct _ec_matrix_row { + ec_code_func_t func; + uint32_t *values; +}; + +struct _ec_matrix { + struct list_head lru; + uint32_t refs; + uint32_t columns; + uint32_t rows; + uintptr_t mask; + ec_code_t *code; + uint32_t *values; + ec_matrix_row_t row_data[0]; +}; + +struct _ec_matrix_list { + struct list_head lru; + gf_lock_t lock; + uint32_t columns; + uint32_t rows; + uint32_t max; + uint32_t count; + uint32_t stripe; + struct mem_pool *pool; + ec_gf_t *gf; + ec_code_t *code; + ec_matrix_t *encode; + ec_matrix_t **objects; +}; + +struct _ec_heal { + struct list_head list; + gf_lock_t lock; + xlator_t *xl; + ec_fop_data_t *fop; + void *data; + ec_fop_data_t *lookup; + loc_t loc; + struct iatt iatt; + char *symlink; + fd_t *fd; + int32_t partial; + int32_t done; + int32_t error; + gf_boolean_t nameheal; + uintptr_t available; + uintptr_t good; + uintptr_t bad; + uintptr_t open; + uintptr_t fixed; + uint64_t offset; + uint64_t size; + uint64_t total_size; + uint64_t version[2]; + uint64_t raw_size; +}; + +struct subvol_healer { + xlator_t *this; + int subvol; + gf_boolean_t running; + gf_boolean_t rerun; + pthread_mutex_t mutex; + pthread_cond_t cond; + pthread_t thread; +}; + +struct 
_ec_self_heald { + gf_boolean_t iamshd; + gf_boolean_t enabled; + int timeout; + uint32_t max_threads; + uint32_t wait_qlength; + struct subvol_healer *index_healers; + struct subvol_healer *full_healers; +}; + +struct _ec_statistics { + struct { + gf_atomic_t hits; /* Cache hits. */ + gf_atomic_t misses; /* Cache misses. */ + gf_atomic_t updates; /* Number of times an existing stripe has + been updated with new content. */ + gf_atomic_t invals; /* Number of times an existing stripe has + been invalidated because of truncates + or discards. */ + gf_atomic_t evicts; /* Number of times that an existing entry + has been evicted to make room for newer + entries. */ + gf_atomic_t allocs; /* Number of memory allocations made to + store stripes. */ + gf_atomic_t errors; /* Number of errors that have caused extra + requests. (Basically memory allocation + errors). */ + } stripe_cache; + struct { + gf_atomic_t attempted; /*Number of heals attempted on + files/directories*/ + gf_atomic_t completed; /*Number of heals completed on files/directories*/ + } shd; +}; + +struct _ec { + xlator_t *xl; + int32_t healers; + int32_t heal_waiters; + int32_t nodes; /* Total number of bricks(n) */ + int32_t bits_for_nodes; + int32_t fragments; /* Data bricks(k) */ + int32_t redundancy; /* Redundant bricks(m) */ + uint32_t fragment_size; /* Size of fragment/chunk on a + brick. */ + uint32_t stripe_size; /* (fragment_size * fragments) + maximum size of user data + stored in one stripe. */ + int32_t up; /* Represents whether EC volume is + up or not. */ + uint32_t idx; + uint32_t xl_up_count; /* Number of UP bricks. */ + uintptr_t xl_up; /* Bit flag representing UP + bricks */ + uint32_t xl_notify_count; /* Number of notifications. */ + uintptr_t xl_notify; /* Bit flag representing + notification for bricks. */ + uintptr_t node_mask; + uintptr_t read_mask; /*Stores user defined read-mask*/ + gf_atomic_t async_fop_count; /* Number of on going asynchronous fops.
*/ + xlator_t **xl_list; + gf_lock_t lock; + gf_timer_t *timer; + gf_boolean_t shutdown; + gf_boolean_t eager_lock; + gf_boolean_t other_eager_lock; + gf_boolean_t optimistic_changelog; + gf_boolean_t parallel_writes; + uint32_t stripe_cache; + uint32_t quorum_count; + uint32_t background_heals; + uint32_t heal_wait_qlen; + uint32_t self_heal_window_size; /* max size of read/writes */ + uint32_t eager_lock_timeout; + uint32_t other_eager_lock_timeout; + struct list_head pending_fops; + struct list_head heal_waiting; + struct list_head healing; + struct mem_pool *fop_pool; + struct mem_pool *cbk_pool; + struct mem_pool *lock_pool; + ec_self_heald_t shd; + char vol_uuid[UUID_SIZE + 1]; + dict_t *leaf_to_subvolid; + ec_read_policy_t read_policy; + ec_matrix_list_t matrix; + ec_statistics_t stats; +}; + +#endif /* __EC_TYPES_H__ */ diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c new file mode 100644 index 00000000000..7344be4968d --- /dev/null +++ b/xlators/cluster/ec/src/ec.c @@ -0,0 +1,1873 @@ +/* + Copyright (c) 2012-2015 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#include <glusterfs/defaults.h> +#include <glusterfs/statedump.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/upcall-utils.h> + +#include "ec.h" +#include "ec-messages.h" +#include "ec-mem-types.h" +#include "ec-types.h" +#include "ec-helpers.h" +#include "ec-common.h" +#include "ec-fops.h" +#include "ec-method.h" +#include "ec-code.h" +#include "ec-heald.h" +#include <glusterfs/events.h> + +static char *ec_read_policies[EC_READ_POLICY_MAX + 1] = { + [EC_ROUND_ROBIN] = "round-robin", + [EC_GFID_HASH] = "gfid-hash", + [EC_READ_POLICY_MAX] = NULL}; + +#define EC_INTERNAL_XATTR_OR_GOTO(name, xattr, op_errno, label) \ + do { \ + if (ec_is_internal_xattr(NULL, (char *)name, NULL, NULL)) { \ + op_errno = EPERM; \ + goto label; \ + } \ + if (name && (strlen(name) == 0) && xattr) { \ + /* Bulk [f]removexattr/[f]setxattr */ \ + GF_IF_INTERNAL_XATTR_GOTO(EC_XATTR_PREFIX "*", xattr, op_errno, \ + label); \ + } \ + } while (0) + +int32_t +ec_parse_options(xlator_t *this) +{ + ec_t *ec = this->private; + int32_t error = EINVAL; + uintptr_t mask; + + GF_OPTION_INIT("redundancy", ec->redundancy, int32, out); + ec->fragments = ec->nodes - ec->redundancy; + if ((ec->redundancy < 1) || (ec->redundancy >= ec->fragments) || + (ec->fragments > EC_MAX_FRAGMENTS)) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_REDUNDANCY, + "Invalid redundancy (must be between " + "1 and %d)", + (ec->nodes - 1) / 2); + + goto out; + } + + ec->bits_for_nodes = 1; + mask = 2; + while (ec->nodes > mask) { + ec->bits_for_nodes++; + mask <<= 1; + } + ec->node_mask = (1ULL << ec->nodes) - 1ULL; + ec->fragment_size = EC_METHOD_CHUNK_SIZE; + ec->stripe_size = ec->fragment_size * ec->fragments; + + gf_msg_debug("ec", 0, + "Initialized with: nodes=%u, fragments=%u, " + "stripe_size=%u, node_mask=%" PRIxFAST32, + ec->nodes, ec->fragments, ec->stripe_size, ec->node_mask); + + error = 0; + +out: + return error; +} + +int32_t +ec_prepare_childs(xlator_t *this) +{ + ec_t *ec = 
this->private; + xlator_list_t *child = NULL; + int32_t count = 0; + + for (child = this->children; child != NULL; child = child->next) { + count++; + } + if (count > EC_MAX_NODES) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_TOO_MANY_SUBVOLS, + "Too many subvolumes"); + + return EINVAL; + } + ec->nodes = count; + + ec->xl_list = GF_CALLOC(count, sizeof(ec->xl_list[0]), ec_mt_xlator_t); + if (ec->xl_list == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Allocation of xlator list failed"); + + return ENOMEM; + } + ec->xl_up = 0; + ec->xl_up_count = 0; + + count = 0; + for (child = this->children; child != NULL; child = child->next) { + ec->xl_list[count++] = child->xlator; + } + + return 0; +} + +/* This function transforms the subvol to subvol-id*/ +static int +_subvol_to_subvolid(dict_t *this, char *key, data_t *value, void *data) +{ + ec_t *ec = data; + xlator_t *subvol = NULL; + int i = 0; + int ret = -1; + + subvol = data_to_ptr(value); + for (i = 0; i < ec->nodes; i++) { + if (ec->xl_list[i] == subvol) { + ret = dict_set_int32(this, key, i); + /* -1 stops dict_foreach and returns -1*/ + if (ret < 0) + ret = -1; + goto out; + } + } +out: + return ret; +} + +int +ec_subvol_to_subvol_id_transform(ec_t *ec, dict_t *leaf_to_subvolid) +{ + return dict_foreach(leaf_to_subvolid, _subvol_to_subvolid, ec); +} + +void +__ec_destroy_private(xlator_t *this) +{ + ec_t *ec = this->private; + + if (ec != NULL) { + LOCK(&ec->lock); + + if (ec->timer != NULL) { + gf_timer_call_cancel(this->ctx, ec->timer); + ec->timer = NULL; + } + + UNLOCK(&ec->lock); + + /* There is a race with timer because there is no way to know if + * timer callback has really been cancelled or it has been scheduled + * for execution. If it has been scheduled, it will crash if we + * destroy ec too fast. 
+ * + * Not sure how this can be solved without using global variables or + * having support from gf_timer_call_cancel() + */ + sleep(2); + + this->private = NULL; + if (ec->xl_list != NULL) { + GF_FREE(ec->xl_list); + ec->xl_list = NULL; + } + + if (ec->fop_pool != NULL) { + mem_pool_destroy(ec->fop_pool); + } + + if (ec->cbk_pool != NULL) { + mem_pool_destroy(ec->cbk_pool); + } + + if (ec->lock_pool != NULL) { + mem_pool_destroy(ec->lock_pool); + } + + LOCK_DESTROY(&ec->lock); + + if (ec->leaf_to_subvolid) + dict_unref(ec->leaf_to_subvolid); + + ec_method_fini(&ec->matrix); + + GF_FREE(ec); + } +} + +int32_t +mem_acct_init(xlator_t *this) +{ + if (xlator_mem_acct_init(this, ec_mt_end + 1) != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Memory accounting initialization " + "failed."); + + return -1; + } + + return 0; +} + +void +ec_configure_background_heal_opts(ec_t *ec, int background_heals, + int heal_wait_qlen) +{ + if (background_heals == 0) { + ec->heal_wait_qlen = 0; + } else { + ec->heal_wait_qlen = heal_wait_qlen; + } + ec->background_heals = background_heals; +} + +int +ec_assign_read_policy(ec_t *ec, char *read_policy) +{ + int read_policy_idx = -1; + + read_policy_idx = gf_get_index_by_elem(ec_read_policies, read_policy); + if (read_policy_idx < 0 || read_policy_idx >= EC_READ_POLICY_MAX) + return -1; + + ec->read_policy = read_policy_idx; + return 0; +} + +int32_t +reconfigure(xlator_t *this, dict_t *options) +{ + ec_t *ec = this->private; + char *read_policy = NULL; + char *extensions = NULL; + uint32_t heal_wait_qlen = 0; + uint32_t background_heals = 0; + int32_t ret = -1; + int32_t err; + + GF_OPTION_RECONF("cpu-extensions", extensions, options, str, failed); + + GF_OPTION_RECONF("self-heal-daemon", ec->shd.enabled, options, bool, + failed); + GF_OPTION_RECONF("iam-self-heal-daemon", ec->shd.iamshd, options, bool, + failed); + GF_OPTION_RECONF("eager-lock", ec->eager_lock, options, bool, failed); + 
GF_OPTION_RECONF("other-eager-lock", ec->other_eager_lock, options, bool, + failed); + GF_OPTION_RECONF("eager-lock-timeout", ec->eager_lock_timeout, options, + uint32, failed); + GF_OPTION_RECONF("other-eager-lock-timeout", ec->other_eager_lock_timeout, + options, uint32, failed); + GF_OPTION_RECONF("background-heals", background_heals, options, uint32, + failed); + GF_OPTION_RECONF("heal-wait-qlength", heal_wait_qlen, options, uint32, + failed); + GF_OPTION_RECONF("self-heal-window-size", ec->self_heal_window_size, + options, uint32, failed); + GF_OPTION_RECONF("heal-timeout", ec->shd.timeout, options, int32, failed); + ec_configure_background_heal_opts(ec, background_heals, heal_wait_qlen); + GF_OPTION_RECONF("shd-max-threads", ec->shd.max_threads, options, uint32, + failed); + GF_OPTION_RECONF("shd-wait-qlength", ec->shd.wait_qlength, options, uint32, + failed); + + GF_OPTION_RECONF("read-policy", read_policy, options, str, failed); + + GF_OPTION_RECONF("optimistic-change-log", ec->optimistic_changelog, options, + bool, failed); + GF_OPTION_RECONF("parallel-writes", ec->parallel_writes, options, bool, + failed); + GF_OPTION_RECONF("stripe-cache", ec->stripe_cache, options, uint32, failed); + GF_OPTION_RECONF("quorum-count", ec->quorum_count, options, uint32, failed); + ret = 0; + if (ec_assign_read_policy(ec, read_policy)) { + ret = -1; + } + + err = ec_method_update(this, &ec->matrix, extensions); + if (err != 0) { + ret = -1; + } + +failed: + return ret; +} + +glusterfs_event_t +ec_get_event_from_state(ec_t *ec) +{ + int down_count = 0; + + if (ec->xl_up_count >= ec->fragments) { + /* If ec is up but some subvolumes are yet to notify, give + * grace time for other subvols to notify to prevent start of + * I/O which may result in self-heals */ + if (ec->xl_notify_count < ec->nodes) + return GF_EVENT_MAXVAL; + + return GF_EVENT_CHILD_UP; + } else { + down_count = ec->xl_notify_count - ec->xl_up_count; + if (down_count > ec->redundancy) + return 
GF_EVENT_CHILD_DOWN; + } + + return GF_EVENT_MAXVAL; +} + +void +ec_up(xlator_t *this, ec_t *ec) +{ + char str1[32], str2[32]; + + if (ec->timer != NULL) { + gf_timer_call_cancel(this->ctx, ec->timer); + ec->timer = NULL; + } + + ec->up = 1; + gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, + "Going UP : Child UP = %s Child Notify = %s", + ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), + ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes)); + + gf_event(EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name); +} + +void +ec_down(xlator_t *this, ec_t *ec) +{ + char str1[32], str2[32]; + + if (ec->timer != NULL) { + gf_timer_call_cancel(this->ctx, ec->timer); + ec->timer = NULL; + } + + ec->up = 0; + gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, + "Going DOWN : Child UP = %s Child Notify = %s", + ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), + ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes)); + + gf_event(EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name); +} + +void +ec_notify_cbk(void *data) +{ + ec_t *ec = data; + glusterfs_event_t event = GF_EVENT_MAXVAL; + gf_boolean_t propagate = _gf_false; + gf_boolean_t launch_heal = _gf_false; + + LOCK(&ec->lock); + { + if (!ec->timer) { + /* + * Either child_up/child_down is already sent to parent + * This is a spurious wake up. + */ + goto unlock; + } + + gf_timer_call_cancel(ec->xl->ctx, ec->timer); + ec->timer = NULL; + + /* The timeout has expired, so any subvolume that has not + * already reported its state, will be considered to be down. + * We mark as if all bricks had reported. */ + ec->xl_notify = (1ULL << ec->nodes) - 1ULL; + ec->xl_notify_count = ec->nodes; + + /* Since we have marked all subvolumes as notified, it's + * guaranteed that ec_get_event_from_state() will return + * CHILD_UP or CHILD_DOWN, but not MAXVAL. */ + event = ec_get_event_from_state(ec); + if (event == GF_EVENT_CHILD_UP) { + /* We are ready to bring the volume up. 
If there are + * still bricks DOWN, they will be healed when they + * come up. */ + ec_up(ec->xl, ec); + + if (ec->shd.iamshd && !ec->shutdown) { + launch_heal = _gf_true; + GF_ATOMIC_INC(ec->async_fop_count); + } + } + + propagate = _gf_true; + } +unlock: + UNLOCK(&ec->lock); + + if (launch_heal) { + /* We have just brought the volume UP, so we trigger + * a self-heal check on the root directory. */ + ec_launch_replace_heal(ec); + } + if (propagate) { + default_notify(ec->xl, event, NULL); + } +} + +void +ec_launch_notify_timer(xlator_t *this, ec_t *ec) +{ + struct timespec delay = { + 0, + }; + + gf_msg_debug(this->name, 0, "Initiating child-down timer"); + delay.tv_sec = 10; + delay.tv_nsec = 0; + ec->timer = gf_timer_call_after(this->ctx, delay, ec_notify_cbk, ec); + if (ec->timer == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_TIMER_CREATE_FAIL, + "Cannot create timer " + "for delayed initialization"); + } +} + +gf_boolean_t +ec_disable_delays(ec_t *ec) +{ + ec->shutdown = _gf_true; + + return __ec_is_last_fop(ec); +} + +void +ec_cleanup_healer_object(ec_t *ec) +{ + struct subvol_healer *healer = NULL; + ec_self_heald_t *shd = NULL; + void *res = NULL; + int i = 0; + gf_boolean_t is_join = _gf_false; + + shd = &ec->shd; + if (!shd->iamshd) + return; + + for (i = 0; i < ec->nodes; i++) { + healer = &shd->index_healers[i]; + pthread_mutex_lock(&healer->mutex); + { + healer->rerun = 1; + if (healer->running) { + pthread_cond_signal(&healer->cond); + is_join = _gf_true; + } + } + pthread_mutex_unlock(&healer->mutex); + if (is_join) { + pthread_join(healer->thread, &res); + is_join = _gf_false; + } + + healer = &shd->full_healers[i]; + pthread_mutex_lock(&healer->mutex); + { + healer->rerun = 1; + if (healer->running) { + pthread_cond_signal(&healer->cond); + is_join = _gf_true; + } + } + pthread_mutex_unlock(&healer->mutex); + if (is_join) { + pthread_join(healer->thread, &res); + is_join = _gf_false; + } + } +} +void +ec_pending_fops_completed(ec_t 
*ec) +{ + if (ec->shutdown) { + default_notify(ec->xl, GF_EVENT_PARENT_DOWN, NULL); + } +} + +static gf_boolean_t +ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state) +{ + uintptr_t current_state = 0; + + if (xlator_is_cleanup_starting(ec->xl)) + return _gf_false; + + if ((ec->xl_notify & index_mask) == 0) { + ec->xl_notify |= index_mask; + ec->xl_notify_count++; + } + current_state = ec->xl_up & index_mask; + if (current_state != new_state) { + ec->xl_up ^= index_mask; + ec->xl_up_count += (current_state ? -1 : 1); + + return _gf_true; + } + + return _gf_false; +} + +static gf_boolean_t +ec_upcall(ec_t *ec, struct gf_upcall *upcall) +{ + struct gf_upcall_cache_invalidation *ci = NULL; + struct gf_upcall_inodelk_contention *lc = NULL; + inode_t *inode; + inode_table_t *table; + + switch (upcall->event_type) { + case GF_UPCALL_CACHE_INVALIDATION: + ci = upcall->data; + ci->flags |= UP_INVAL_ATTR; + return _gf_true; + + case GF_UPCALL_INODELK_CONTENTION: + lc = upcall->data; + if (strcmp(lc->domain, ec->xl->name) != 0) { + /* The lock is not owned by EC, ignore it. */ + return _gf_true; + } + table = ((xlator_t *)ec->xl->graph->top)->itable; + if (table == NULL) { + /* Self-heal daemon doesn't have an inode table on the top + * xlator because it doesn't need it. In this case we should + * use the inode table managed by EC itself where all inodes + * being healed should be present. However self-heal doesn't + * use eager-locking and inodelk's are already released as + * soon as possible. In this case we can safely ignore these + * notifications. */ + return _gf_false; + } + inode = inode_find(table, upcall->gfid); + /* If inode is not found, it means that it's already released, + * so we can ignore it. Probably it has been released and + * destroyed while the contention notification was being sent. 
+ */ + if (inode != NULL) { + ec_lock_release(ec, inode); + inode_unref(inode); + } + + return _gf_false; + + default: + return _gf_true; + } +} + +int32_t +ec_notify(xlator_t *this, int32_t event, void *data, void *data2) +{ + ec_t *ec = this->private; + int32_t idx = 0; + int32_t error = 0; + glusterfs_event_t old_event = GF_EVENT_MAXVAL; + dict_t *input = NULL; + dict_t *output = NULL; + gf_boolean_t propagate = _gf_true; + gf_boolean_t needs_shd_check = _gf_false; + int32_t orig_event = event; + uintptr_t mask = 0; + + gf_msg_trace(this->name, 0, "NOTIFY(%d): %p, %p", event, data, data2); + + if (event == GF_EVENT_UPCALL) { + propagate = ec_upcall(ec, data); + goto done; + } + + if (event == GF_EVENT_TRANSLATOR_OP) { + if (!ec->up) { + error = -1; + } else { + input = data; + output = data2; + error = ec_xl_op(this, input, output); + } + goto out; + } + + for (idx = 0; idx < ec->nodes; idx++) { + if (ec->xl_list[idx] == data) { + break; + } + } + + LOCK(&ec->lock); + + if (event == GF_EVENT_PARENT_UP) { + /* + * Start a timer which sends appropriate event to parent + * xlator to prevent the 'mount' syscall from hanging. + */ + ec_launch_notify_timer(this, ec); + goto unlock; + } else if (event == GF_EVENT_PARENT_DOWN) { + /* If there aren't pending fops running after we have waken up + * them, we immediately propagate the notification. */ + propagate = ec_disable_delays(ec); + ec_cleanup_healer_object(ec); + goto unlock; + } + + if (idx < ec->nodes) { /* CHILD_* events */ + old_event = ec_get_event_from_state(ec); + + mask = 1ULL << idx; + if (event == GF_EVENT_CHILD_UP) { + /* We need to trigger a selfheal if a brick changes + * to UP state. 
*/ + if (ec_set_up_state(ec, mask, mask) && ec->shd.iamshd && + !ec->shutdown) { + needs_shd_check = _gf_true; + } + } else if (event == GF_EVENT_CHILD_DOWN) { + ec_set_up_state(ec, mask, 0); + } + + event = ec_get_event_from_state(ec); + + if (event == GF_EVENT_CHILD_UP) { + if (!ec->up) { + ec_up(this, ec); + } + } else { + /* If the volume is not UP, it's irrelevant if one + * brick has come up. We cannot heal anything. */ + needs_shd_check = _gf_false; + + if ((event == GF_EVENT_CHILD_DOWN) && ec->up) { + ec_down(this, ec); + } + } + + if (event != GF_EVENT_MAXVAL) { + if (event == old_event) { + if (orig_event == GF_EVENT_CHILD_UP) + event = GF_EVENT_SOME_DESCENDENT_UP; + else /* orig_event has to be GF_EVENT_CHILD_DOWN */ + event = GF_EVENT_SOME_DESCENDENT_DOWN; + } + } else { + propagate = _gf_false; + needs_shd_check = _gf_false; + } + + if (needs_shd_check) { + GF_ATOMIC_INC(ec->async_fop_count); + } + } +unlock: + UNLOCK(&ec->lock); + +done: + if (needs_shd_check) { + ec_launch_replace_heal(ec); + } + if (propagate) { + error = default_notify(this, event, data); + } + +out: + return error; +} + +int32_t +notify(xlator_t *this, int32_t event, void *data, ...) 
+{ + int ret = -1; + va_list ap; + void *data2 = NULL; + + va_start(ap, data); + data2 = va_arg(ap, dict_t *); + va_end(ap); + ret = ec_notify(this, event, data, data2); + + return ret; +} + +static void +ec_statistics_init(ec_t *ec) +{ + GF_ATOMIC_INIT(ec->stats.stripe_cache.hits, 0); + GF_ATOMIC_INIT(ec->stats.stripe_cache.misses, 0); + GF_ATOMIC_INIT(ec->stats.stripe_cache.updates, 0); + GF_ATOMIC_INIT(ec->stats.stripe_cache.invals, 0); + GF_ATOMIC_INIT(ec->stats.stripe_cache.evicts, 0); + GF_ATOMIC_INIT(ec->stats.stripe_cache.allocs, 0); + GF_ATOMIC_INIT(ec->stats.stripe_cache.errors, 0); + GF_ATOMIC_INIT(ec->stats.shd.attempted, 0); + GF_ATOMIC_INIT(ec->stats.shd.completed, 0); +} + +static int +ec_assign_read_mask(ec_t *ec, char *read_mask_str) +{ + char *mask = NULL; + char *maskptr = NULL; + char *saveptr = NULL; + char *id_str = NULL; + int id = 0; + int ret = 0; + uintptr_t read_mask = 0; + + if (!read_mask_str) { + ec->read_mask = 0; + ret = 0; + goto out; + } + + mask = gf_strdup(read_mask_str); + if (!mask) { + ret = -1; + goto out; + } + maskptr = mask; + + for (;;) { + id_str = strtok_r(maskptr, ":", &saveptr); + if (id_str == NULL) + break; + if (gf_string2int(id_str, &id)) { + gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL, + "In read-mask \"%s\" id %s is not a valid integer", + read_mask_str, id_str); + ret = -1; + goto out; + } + + if ((id < 0) || (id >= ec->nodes)) { + gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL, + "In read-mask \"%s\" id %d is not in range [0 - %d]", + read_mask_str, id, ec->nodes - 1); + ret = -1; + goto out; + } + read_mask |= (1UL << id); + maskptr = NULL; + } + + if (gf_bits_count(read_mask) < ec->fragments) { + gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL, + "read-mask \"%s\" should contain at least %d ids", read_mask_str, + ec->fragments); + ret = -1; + goto out; + } + ec->read_mask = read_mask; + ret = 0; +out: + GF_FREE(mask); + return ret; +} + +int32_t +init(xlator_t 
*this) +{ + ec_t *ec = NULL; + char *read_policy = NULL; + char *extensions = NULL; + int32_t err; + char *read_mask_str = NULL; + + if (this->parents == NULL) { + gf_msg(this->name, GF_LOG_WARNING, 0, EC_MSG_NO_PARENTS, + "Volume does not have parents."); + } + + ec = GF_MALLOC(sizeof(*ec), ec_mt_ec_t); + if (ec == NULL) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to allocate private memory."); + + return -1; + } + memset(ec, 0, sizeof(*ec)); + + this->private = ec; + + ec->xl = this; + LOCK_INIT(&ec->lock); + + GF_ATOMIC_INIT(ec->async_fop_count, 0); + INIT_LIST_HEAD(&ec->pending_fops); + INIT_LIST_HEAD(&ec->heal_waiting); + INIT_LIST_HEAD(&ec->healing); + + ec->fop_pool = mem_pool_new(ec_fop_data_t, 1024); + ec->cbk_pool = mem_pool_new(ec_cbk_data_t, 4096); + ec->lock_pool = mem_pool_new(ec_lock_t, 1024); + if ((ec->fop_pool == NULL) || (ec->cbk_pool == NULL) || + (ec->lock_pool == NULL)) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY, + "Failed to create memory pools."); + + goto failed; + } + + if (ec_prepare_childs(this) != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL, + "Failed to initialize xlator"); + + goto failed; + } + + if (ec_parse_options(this) != 0) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_XLATOR_PARSE_OPT_FAIL, + "Failed to parse xlator options"); + + goto failed; + } + + GF_OPTION_INIT("cpu-extensions", extensions, str, failed); + + err = ec_method_init(this, &ec->matrix, ec->fragments, ec->nodes, + ec->nodes * 2, extensions); + if (err != 0) { + gf_msg(this->name, GF_LOG_ERROR, -err, EC_MSG_MATRIX_FAILED, + "Failed to initialize matrix management"); + + goto failed; + } + + GF_OPTION_INIT("self-heal-daemon", ec->shd.enabled, bool, failed); + GF_OPTION_INIT("iam-self-heal-daemon", ec->shd.iamshd, bool, failed); + GF_OPTION_INIT("eager-lock", ec->eager_lock, bool, failed); + GF_OPTION_INIT("other-eager-lock", ec->other_eager_lock, bool, failed); + 
GF_OPTION_INIT("eager-lock-timeout", ec->eager_lock_timeout, uint32, + failed); + GF_OPTION_INIT("other-eager-lock-timeout", ec->other_eager_lock_timeout, + uint32, failed); + GF_OPTION_INIT("background-heals", ec->background_heals, uint32, failed); + GF_OPTION_INIT("heal-wait-qlength", ec->heal_wait_qlen, uint32, failed); + GF_OPTION_INIT("self-heal-window-size", ec->self_heal_window_size, uint32, + failed); + ec_configure_background_heal_opts(ec, ec->background_heals, + ec->heal_wait_qlen); + GF_OPTION_INIT("read-policy", read_policy, str, failed); + if (ec_assign_read_policy(ec, read_policy)) + goto failed; + + GF_OPTION_INIT("heal-timeout", ec->shd.timeout, int32, failed); + GF_OPTION_INIT("shd-max-threads", ec->shd.max_threads, uint32, failed); + GF_OPTION_INIT("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed); + GF_OPTION_INIT("optimistic-change-log", ec->optimistic_changelog, bool, + failed); + GF_OPTION_INIT("parallel-writes", ec->parallel_writes, bool, failed); + GF_OPTION_INIT("stripe-cache", ec->stripe_cache, uint32, failed); + GF_OPTION_INIT("quorum-count", ec->quorum_count, uint32, failed); + GF_OPTION_INIT("ec-read-mask", read_mask_str, str, failed); + + if (ec_assign_read_mask(ec, read_mask_str)) + goto failed; + + this->itable = inode_table_new(EC_SHD_INODE_LRU_LIMIT, this); + if (!this->itable) + goto failed; + + if (ec->shd.iamshd) + ec_selfheal_daemon_init(this); + gf_msg_debug(this->name, 0, "Disperse translator initialized."); + + ec->leaf_to_subvolid = dict_new(); + if (!ec->leaf_to_subvolid) + goto failed; + if (glusterfs_reachable_leaves(this, ec->leaf_to_subvolid)) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_SUBVOL_BUILD_FAIL, + "Failed to build subvol " + "dictionary"); + goto failed; + } + + if (ec_subvol_to_subvol_id_transform(ec, ec->leaf_to_subvolid) < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_SUBVOL_ID_DICT_SET_FAIL, + "Failed to build subvol-id " + "dictionary"); + goto failed; + } + + ec_statistics_init(ec); + + 
return 0; + +failed: + __ec_destroy_private(this); + + return -1; +} + +void +fini(xlator_t *this) +{ + ec_selfheal_daemon_fini(this); + __ec_destroy_private(this); +} + +int32_t +ec_gf_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) +{ + ec_access(frame, this, -1, EC_MINIMUM_ONE, default_access_cbk, NULL, loc, + mask, xdata); + + return 0; +} + +int32_t +ec_gf_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + ec_create(frame, this, -1, EC_MINIMUM_MIN, default_create_cbk, NULL, loc, + flags, mode, umask, fd, xdata); + + return 0; +} + +int32_t +ec_gf_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + ec_discard(frame, this, -1, EC_MINIMUM_MIN, default_discard_cbk, NULL, fd, + offset, len, xdata); + + return 0; +} + +int32_t +ec_gf_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) +{ + uint32_t fop_flags = EC_MINIMUM_ALL; + + if (cmd == ENTRYLK_UNLOCK) + fop_flags = EC_MINIMUM_ONE; + ec_entrylk(frame, this, -1, fop_flags, default_entrylk_cbk, NULL, volume, + loc, basename, cmd, type, xdata); + + return 0; +} + +int32_t +ec_gf_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, + fd_t *fd, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) +{ + uint32_t fop_flags = EC_MINIMUM_ALL; + + if (cmd == ENTRYLK_UNLOCK) + fop_flags = EC_MINIMUM_ONE; + ec_fentrylk(frame, this, -1, fop_flags, default_fentrylk_cbk, NULL, volume, + fd, basename, cmd, type, xdata); + + return 0; +} + +int32_t +ec_gf_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + ec_fallocate(frame, this, -1, EC_MINIMUM_MIN, default_fallocate_cbk, NULL, + fd, mode, offset, len, xdata); + + return 0; +} + +int32_t 
+ec_gf_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + ec_flush(frame, this, -1, EC_MINIMUM_MIN, default_flush_cbk, NULL, fd, + xdata); + + return 0; +} + +int32_t +ec_gf_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + ec_fsync(frame, this, -1, EC_MINIMUM_MIN, default_fsync_cbk, NULL, fd, + datasync, xdata); + + return 0; +} + +int32_t +ec_gf_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + ec_fsyncdir(frame, this, -1, EC_MINIMUM_MIN, default_fsyncdir_cbk, NULL, fd, + datasync, xdata); + + return 0; +} + +int +ec_marker_populate_args(call_frame_t *frame, int type, int *gauge, + xlator_t **subvols) +{ + xlator_t *this = frame->this; + ec_t *ec = this->private; + + memcpy(subvols, ec->xl_list, sizeof(*subvols) * ec->nodes); + + if (type == MARKER_XTIME_TYPE) { + /*Don't error out on ENOENT/ENOTCONN */ + gauge[MCNT_NOTFOUND] = 0; + gauge[MCNT_ENOTCONN] = 0; + } + + return ec->nodes; +} + +int32_t +ec_handle_heal_commands(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + dict_t *dict_rsp = NULL; + int op_ret = -1; + int op_errno = ENOMEM; + + if (!name || strcmp(name, GF_HEAL_INFO)) + return -1; + + op_errno = -ec_get_heal_info(this, loc, &dict_rsp); + if (op_errno <= 0) { + op_errno = op_ret = 0; + } + + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict_rsp, NULL); + if (dict_rsp) + dict_unref(dict_rsp); + return 0; +} + +int32_t +ec_gf_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + int error = 0; + ec_t *ec = this->private; + int32_t fop_flags = EC_MINIMUM_ONE; + + if (name && strcmp(name, EC_XATTR_HEAL) != 0) { + EC_INTERNAL_XATTR_OR_GOTO(name, NULL, error, out); + } + + if (ec_handle_heal_commands(frame, this, loc, name, xdata) == 0) + return 0; + + if (cluster_handle_marker_getxattr(frame, loc, name, ec->vol_uuid, NULL, + ec_marker_populate_args) == 
0) + return 0; + + if (name && ((fnmatch(GF_XATTR_STIME_PATTERN, name, 0) == 0) || + XATTR_IS_NODE_UUID(name) || XATTR_IS_NODE_UUID_LIST(name))) { + fop_flags = EC_MINIMUM_ALL; + } + + ec_getxattr(frame, this, -1, fop_flags, default_getxattr_cbk, NULL, loc, + name, xdata); + + return 0; +out: + error = ENODATA; + STACK_UNWIND_STRICT(getxattr, frame, -1, error, NULL, NULL); + return 0; +} + +int32_t +ec_gf_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + int error = 0; + + EC_INTERNAL_XATTR_OR_GOTO(name, NULL, error, out); + + ec_fgetxattr(frame, this, -1, EC_MINIMUM_ONE, default_fgetxattr_cbk, NULL, + fd, name, xdata); + return 0; +out: + error = ENODATA; + STACK_UNWIND_STRICT(fgetxattr, frame, -1, error, NULL, NULL); + return 0; +} + +int32_t +ec_gf_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata) +{ + int32_t fop_flags = EC_MINIMUM_ALL; + + if (flock->l_type == F_UNLCK) + fop_flags = EC_MINIMUM_ONE; + + ec_inodelk(frame, this, &frame->root->lk_owner, -1, fop_flags, + default_inodelk_cbk, NULL, volume, loc, cmd, flock, xdata); + + return 0; +} + +int32_t +ec_gf_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, + fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) +{ + int32_t fop_flags = EC_MINIMUM_ALL; + + if (flock->l_type == F_UNLCK) + fop_flags = EC_MINIMUM_ONE; + ec_finodelk(frame, this, &frame->root->lk_owner, -1, fop_flags, + default_finodelk_cbk, NULL, volume, fd, cmd, flock, xdata); + + return 0; +} + +int32_t +ec_gf_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + ec_link(frame, this, -1, EC_MINIMUM_MIN, default_link_cbk, NULL, oldloc, + newloc, xdata); + + return 0; +} + +int32_t +ec_gf_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) +{ + int32_t fop_flags = EC_MINIMUM_ALL; + + if (flock->l_type == 
F_UNLCK) + fop_flags = EC_MINIMUM_ONE; + ec_lk(frame, this, -1, fop_flags, default_lk_cbk, NULL, fd, cmd, flock, + xdata); + + return 0; +} + +int32_t +ec_gf_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + ec_lookup(frame, this, -1, EC_MINIMUM_MIN, default_lookup_cbk, NULL, loc, + xdata); + + return 0; +} + +int32_t +ec_gf_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + ec_mkdir(frame, this, -1, EC_MINIMUM_MIN, default_mkdir_cbk, NULL, loc, + mode, umask, xdata); + + return 0; +} + +int32_t +ec_gf_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + ec_mknod(frame, this, -1, EC_MINIMUM_MIN, default_mknod_cbk, NULL, loc, + mode, rdev, umask, xdata); + + return 0; +} + +int32_t +ec_gf_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + ec_open(frame, this, -1, EC_MINIMUM_MIN, default_open_cbk, NULL, loc, flags, + fd, xdata); + + return 0; +} + +int32_t +ec_gf_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + ec_opendir(frame, this, -1, EC_MINIMUM_MIN, default_opendir_cbk, NULL, loc, + fd, xdata); + + return 0; +} + +int32_t +ec_gf_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + ec_readdir(frame, this, -1, EC_MINIMUM_ONE, default_readdir_cbk, NULL, fd, + size, offset, xdata); + + return 0; +} + +int32_t +ec_gf_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + ec_readdirp(frame, this, -1, EC_MINIMUM_ONE, default_readdirp_cbk, NULL, fd, + size, offset, xdata); + + return 0; +} + +int32_t +ec_gf_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) +{ + ec_readlink(frame, this, -1, EC_MINIMUM_ONE, default_readlink_cbk, NULL, + loc, size, xdata); + + return 0; +} + +int32_t +ec_gf_readv(call_frame_t 
*frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + ec_readv(frame, this, -1, EC_MINIMUM_MIN, default_readv_cbk, NULL, fd, size, + offset, flags, xdata); + + return 0; +} + +int32_t +ec_gf_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + int error = 0; + + EC_INTERNAL_XATTR_OR_GOTO(name, xdata, error, out); + + ec_removexattr(frame, this, -1, EC_MINIMUM_MIN, default_removexattr_cbk, + NULL, loc, name, xdata); + + return 0; +out: + STACK_UNWIND_STRICT(removexattr, frame, -1, error, NULL); + return 0; +} + +int32_t +ec_gf_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + int error = 0; + + EC_INTERNAL_XATTR_OR_GOTO(name, xdata, error, out); + + ec_fremovexattr(frame, this, -1, EC_MINIMUM_MIN, default_fremovexattr_cbk, + NULL, fd, name, xdata); + + return 0; +out: + STACK_UNWIND_STRICT(fremovexattr, frame, -1, error, NULL); + return 0; +} + +int32_t +ec_gf_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + ec_rename(frame, this, -1, EC_MINIMUM_MIN, default_rename_cbk, NULL, oldloc, + newloc, xdata); + + return 0; +} + +int32_t +ec_gf_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) +{ + ec_rmdir(frame, this, -1, EC_MINIMUM_MIN, default_rmdir_cbk, NULL, loc, + xflags, xdata); + + return 0; +} + +int32_t +ec_gf_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + ec_setattr(frame, this, -1, EC_MINIMUM_MIN, default_setattr_cbk, NULL, loc, + stbuf, valid, xdata); + + return 0; +} + +int32_t +ec_gf_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + ec_fsetattr(frame, this, -1, EC_MINIMUM_MIN, default_fsetattr_cbk, NULL, fd, + stbuf, valid, xdata); + + return 0; +} + +int32_t +ec_gf_setxattr(call_frame_t *frame, xlator_t *this, 
loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + int error = 0; + + EC_INTERNAL_XATTR_OR_GOTO("", dict, error, out); + + ec_setxattr(frame, this, -1, EC_MINIMUM_MIN, default_setxattr_cbk, NULL, + loc, dict, flags, xdata); + + return 0; +out: + STACK_UNWIND_STRICT(setxattr, frame, -1, error, NULL); + return 0; +} + +int32_t +ec_gf_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + int error = 0; + + EC_INTERNAL_XATTR_OR_GOTO("", dict, error, out); + + ec_fsetxattr(frame, this, -1, EC_MINIMUM_MIN, default_fsetxattr_cbk, NULL, + fd, dict, flags, xdata); + + return 0; +out: + STACK_UNWIND_STRICT(fsetxattr, frame, -1, error, NULL); + return 0; +} + +int32_t +ec_gf_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + ec_stat(frame, this, -1, EC_MINIMUM_MIN, default_stat_cbk, NULL, loc, + xdata); + + return 0; +} + +int32_t +ec_gf_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + ec_fstat(frame, this, -1, EC_MINIMUM_MIN, default_fstat_cbk, NULL, fd, + xdata); + + return 0; +} + +int32_t +ec_gf_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + ec_statfs(frame, this, -1, EC_MINIMUM_MIN, default_statfs_cbk, NULL, loc, + xdata); + + return 0; +} + +int32_t +ec_gf_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + ec_symlink(frame, this, -1, EC_MINIMUM_MIN, default_symlink_cbk, NULL, + linkname, loc, umask, xdata); + + return 0; +} + +int32_t +ec_gf_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + ec_truncate(frame, this, -1, EC_MINIMUM_MIN, default_truncate_cbk, NULL, + loc, offset, xdata); + + return 0; +} + +int32_t +ec_gf_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + ec_ftruncate(frame, this, -1, EC_MINIMUM_MIN, default_ftruncate_cbk, NULL, + fd, offset, xdata); + + return 0; +} + 
+int32_t +ec_gf_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) +{ + ec_unlink(frame, this, -1, EC_MINIMUM_MIN, default_unlink_cbk, NULL, loc, + xflags, xdata); + + return 0; +} + +int32_t +ec_gf_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + ec_writev(frame, this, -1, EC_MINIMUM_MIN, default_writev_cbk, NULL, fd, + vector, count, offset, flags, iobref, xdata); + + return 0; +} + +int32_t +ec_gf_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + ec_xattrop(frame, this, -1, EC_MINIMUM_MIN, default_xattrop_cbk, NULL, loc, + optype, xattr, xdata); + + return 0; +} + +int32_t +ec_gf_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + ec_fxattrop(frame, this, -1, EC_MINIMUM_MIN, default_fxattrop_cbk, NULL, fd, + optype, xattr, xdata); + + return 0; +} + +int32_t +ec_gf_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + default_zerofill_failure_cbk(frame, ENOTSUP); + + return 0; +} + +int32_t +ec_gf_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) +{ + ec_seek(frame, this, -1, EC_MINIMUM_ONE, default_seek_cbk, NULL, fd, offset, + what, xdata); + + return 0; +} + +int32_t +ec_gf_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) +{ + ec_ipc(frame, this, -1, EC_MINIMUM_MIN, default_ipc_cbk, NULL, op, xdata); + return 0; +} + +int32_t +ec_gf_forget(xlator_t *this, inode_t *inode) +{ + uint64_t value = 0; + ec_inode_t *ctx = NULL; + + if ((inode_ctx_del(inode, this, &value) == 0) && (value != 0)) { + ctx = (ec_inode_t *)(uintptr_t)value; + /* We can only forget an inode if it has been unlocked, so the stripe + * cache should also be empty. 
*/ + GF_ASSERT(list_empty(&ctx->stripe_cache.lru)); + GF_FREE(ctx); + } + + return 0; +} + +void +ec_gf_release_fd(xlator_t *this, fd_t *fd) +{ + uint64_t value = 0; + ec_fd_t *ctx = NULL; + + if ((fd_ctx_del(fd, this, &value) == 0) && (value != 0)) { + ctx = (ec_fd_t *)(uintptr_t)value; + loc_wipe(&ctx->loc); + GF_FREE(ctx); + } +} + +int32_t +ec_gf_release(xlator_t *this, fd_t *fd) +{ + ec_gf_release_fd(this, fd); + + return 0; +} + +int32_t +ec_gf_releasedir(xlator_t *this, fd_t *fd) +{ + ec_gf_release_fd(this, fd); + + return 0; +} + +int32_t +ec_dump_private(xlator_t *this) +{ + ec_t *ec = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + char tmp[65]; + + GF_ASSERT(this); + + ec = this->private; + GF_ASSERT(ec); + + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); + gf_proc_dump_add_section("%s", key_prefix); + gf_proc_dump_write("up", "%u", ec->up); + gf_proc_dump_write("nodes", "%u", ec->nodes); + gf_proc_dump_write("redundancy", "%u", ec->redundancy); + gf_proc_dump_write("fragment_size", "%u", ec->fragment_size); + gf_proc_dump_write("stripe_size", "%u", ec->stripe_size); + gf_proc_dump_write("childs_up", "%u", ec->xl_up_count); + gf_proc_dump_write("childs_up_mask", "%s", + ec_bin(tmp, sizeof(tmp), ec->xl_up, ec->nodes)); + if (ec->read_mask) { + gf_proc_dump_write("read-mask", "%s", + ec_bin(tmp, sizeof(tmp), ec->read_mask, ec->nodes)); + } + gf_proc_dump_write("background-heals", "%d", ec->background_heals); + gf_proc_dump_write("heal-wait-qlength", "%d", ec->heal_wait_qlen); + gf_proc_dump_write("self-heal-window-size", "%" PRIu32, + ec->self_heal_window_size); + gf_proc_dump_write("healers", "%d", ec->healers); + gf_proc_dump_write("heal-waiters", "%d", ec->heal_waiters); + gf_proc_dump_write("read-policy", "%s", ec_read_policies[ec->read_policy]); + gf_proc_dump_write("parallel-writes", "%d", ec->parallel_writes); + gf_proc_dump_write("quorum-count", "%u", ec->quorum_count); + + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, 
"%s.%s.stats.stripe_cache", + this->type, this->name); + gf_proc_dump_add_section("%s", key_prefix); + + gf_proc_dump_write("hits", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(ec->stats.stripe_cache.hits)); + gf_proc_dump_write("misses", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(ec->stats.stripe_cache.misses)); + gf_proc_dump_write("updates", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(ec->stats.stripe_cache.updates)); + gf_proc_dump_write("invalidations", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(ec->stats.stripe_cache.invals)); + gf_proc_dump_write("evicts", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(ec->stats.stripe_cache.evicts)); + gf_proc_dump_write("allocations", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(ec->stats.stripe_cache.allocs)); + gf_proc_dump_write("errors", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(ec->stats.stripe_cache.errors)); + gf_proc_dump_write("heals-attempted", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(ec->stats.shd.attempted)); + gf_proc_dump_write("heals-completed", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(ec->stats.shd.completed)); + + return 0; +} + +struct xlator_fops fops = {.lookup = ec_gf_lookup, + .stat = ec_gf_stat, + .fstat = ec_gf_fstat, + .truncate = ec_gf_truncate, + .ftruncate = ec_gf_ftruncate, + .access = ec_gf_access, + .readlink = ec_gf_readlink, + .mknod = ec_gf_mknod, + .mkdir = ec_gf_mkdir, + .unlink = ec_gf_unlink, + .rmdir = ec_gf_rmdir, + .symlink = ec_gf_symlink, + .rename = ec_gf_rename, + .link = ec_gf_link, + .create = ec_gf_create, + .open = ec_gf_open, + .readv = ec_gf_readv, + .writev = ec_gf_writev, + .flush = ec_gf_flush, + .fsync = ec_gf_fsync, + .opendir = ec_gf_opendir, + .readdir = ec_gf_readdir, + .readdirp = ec_gf_readdirp, + .fsyncdir = ec_gf_fsyncdir, + .statfs = ec_gf_statfs, + .setxattr = ec_gf_setxattr, + .getxattr = ec_gf_getxattr, + .fsetxattr = ec_gf_fsetxattr, + .fgetxattr = ec_gf_fgetxattr, + .removexattr = ec_gf_removexattr, + .fremovexattr = ec_gf_fremovexattr, + .lk = ec_gf_lk, + .inodelk = ec_gf_inodelk, + .finodelk = ec_gf_finodelk, + .entrylk = 
ec_gf_entrylk, + .fentrylk = ec_gf_fentrylk, + .xattrop = ec_gf_xattrop, + .fxattrop = ec_gf_fxattrop, + .setattr = ec_gf_setattr, + .fsetattr = ec_gf_fsetattr, + .fallocate = ec_gf_fallocate, + .discard = ec_gf_discard, + .zerofill = ec_gf_zerofill, + .seek = ec_gf_seek, + .ipc = ec_gf_ipc}; + +struct xlator_cbks cbks = {.forget = ec_gf_forget, + .release = ec_gf_release, + .releasedir = ec_gf_releasedir}; + +struct xlator_dumpops dumpops = {.priv = ec_dump_private}; + +struct volume_options options[] = { + {.key = {"redundancy"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "{{ volume.redundancy }}", + .description = "Maximum number of bricks that can fail " + "simultaneously without losing data."}, + { + .key = {"self-heal-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .description = "self-heal daemon enable/disable", + .default_value = "enable", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"disperse"}, + }, + {.key = {"iam-self-heal-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option differentiates if the disperse " + "translator is running as part of self-heal-daemon " + "or not."}, + {.key = {"eager-lock"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .op_version = {GD_OP_VERSION_3_7_10}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = {"disperse"}, + .description = "Enable/Disable eager lock for regular files on a " + "disperse volume. If a fop takes a lock and completes " + "its operation, it waits for next 1 second before " + "releasing the lock, to see if the lock can be reused " + "for next fop from the same client. If ec finds any lock " + "contention within 1 second it releases the lock " + "immediately before time expires. This improves the " + "performance of file operations. 
However, as it takes " + "lock on first brick, for few operations like read, " + "discovery of lock contention might take long time and " + "can actually degrade the performance. If eager lock is " + "disabled, lock will be released as soon as fop " + "completes."}, + {.key = {"other-eager-lock"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .op_version = {GD_OP_VERSION_3_13_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = {"disperse"}, + .description = "It's equivalent to the eager-lock option but for non " + "regular files."}, + {.key = {"eager-lock-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 60, + .default_value = "1", + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = {"disperse", "locks", "timeout"}, + .description = "Maximum time (in seconds) that a lock on an inode is " + "kept held if no new operations on the inode are " + "received."}, + {.key = {"other-eager-lock-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 60, + .default_value = "1", + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = {"disperse", "locks", "timeout"}, + .description = "It's equivalent to eager-lock-timeout option but for " + "non regular files."}, + { + .key = {"background-heals"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, /*Disabling background heals*/ + .max = 256, + .default_value = "8", + .op_version = {GD_OP_VERSION_3_7_3}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = {"disperse"}, + .description = "This option can be used to control number of parallel" + " heals", + }, + { + .key = {"heal-wait-qlength"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = + 65536, /*Around 100MB as of now with sizeof(ec_fop_data_t) at 1800*/ + .default_value = "128", + .op_version = {GD_OP_VERSION_3_7_3}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | 
OPT_FLAG_DOC, + .tags = {"disperse"}, + .description = "This option can be used to control number of heals" + " that can wait", + }, + {.key = {"heal-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 60, + .max = INT_MAX, + .default_value = "600", + .op_version = {GD_OP_VERSION_3_7_3}, + .flags = OPT_FLAG_SETTABLE, + .tags = {"disperse"}, + .description = "time interval for checking the need to self-heal " + "in self-heal-daemon"}, + { + .key = {"read-policy"}, + .type = GF_OPTION_TYPE_STR, + .value = {"round-robin", "gfid-hash"}, + .default_value = "gfid-hash", + .op_version = {GD_OP_VERSION_3_7_6}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = {"disperse"}, + .description = + "inode-read fops happen only on 'k' number of bricks in" + " n=k+m disperse subvolume. 'round-robin' selects the read" + " subvolume using round-robin algo. 'gfid-hash' selects read" + " subvolume based on hash of the gfid of that file/directory.", + }, + {.key = {"shd-max-threads"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 64, + .default_value = "1", + .op_version = {GD_OP_VERSION_3_9_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"disperse"}, + .description = "Maximum number of parallel heals SHD can do per local " + "brick. 
This can substantially lower heal times, " + "but can also crush your bricks if you don't have " + "the storage hardware to support this."}, + {.key = {"shd-wait-qlength"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 65536, + .default_value = "1024", + .op_version = {GD_OP_VERSION_3_9_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"disperse"}, + .description = "This option can be used to control number of heals" + " that can wait in SHD per subvolume"}, + {.key = {"cpu-extensions"}, + .type = GF_OPTION_TYPE_STR, + .value = {"none", "auto", "x64", "sse", "avx"}, + .default_value = "auto", + .op_version = {GD_OP_VERSION_3_9_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = {"disperse"}, + .description = "force the cpu extensions to be used to accelerate the " + "galois field computations."}, + {.key = {"self-heal-window-size"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 1024, + .default_value = "1", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = {"disperse"}, + .description = "Maximum number blocks(128KB) per file for which " + "self-heal process would be applied simultaneously."}, + {.key = {"optimistic-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .op_version = {GD_OP_VERSION_3_10_1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT, + .tags = {"disperse"}, + .description = "Set/Unset dirty flag for every update fop at the start" + "of the fop. If OFF, this option impacts performance of" + "entry operations or metadata operations as it will" + "set dirty flag at the start and unset it at the end of" + "ALL update fop. If ON and all the bricks are good," + "dirty flag will be set at the start only for file fops" + "For metadata and entry fops dirty flag will not be set" + "at the start, if all the bricks are good. 
This does" + "not impact performance for metadata operations and" + "entry operation but has a very small window to miss" + "marking entry as dirty in case it is required to be" + "healed"}, + {.key = {"parallel-writes"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "This controls if writes can be wound in parallel as long" + "as it doesn't modify same stripes"}, + {.key = {"stripe-cache"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, /*Disabling stripe_cache*/ + .max = EC_STRIPE_CACHE_MAX_SIZE, + .default_value = "4", + .description = "This option will keep the last stripe of write fop" + "in memory. If next write falls in this stripe, we need" + "not to read it again from backend and we can save READ" + "fop going over the network. This will improve performance," + "specially for sequential writes. However, this will also" + "lead to extra memory consumption, maximum " + "(cache size * stripe size) Bytes per open file."}, + { + .key = {"quorum-count"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "0", + .description = + "This option can be used to define how many successes on" + "the bricks constitute a success to the application. 
This" + " count should be in the range" + "[disperse-data-count, disperse-count] (inclusive)", + }, + { + .key = {"ec-read-mask"}, + .type = GF_OPTION_TYPE_STR, + .default_value = NULL, + .description = "This option can be used to choose which bricks can be" + " used for reading data/metadata of a file/directory", + }, + { + .key = {NULL}, + }, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "disperse", + .category = GF_MAINTAINED, +}; diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h new file mode 100644 index 00000000000..6f6de6d5981 --- /dev/null +++ b/xlators/cluster/ec/src/ec.h @@ -0,0 +1,34 @@ +/* + Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_H__ +#define __EC_H__ + +#include "ec-method.h" + +#define EC_XATTR_PREFIX "trusted.ec." +#define EC_XATTR_CONFIG EC_XATTR_PREFIX "config" +#define EC_XATTR_SIZE EC_XATTR_PREFIX "size" +#define EC_XATTR_VERSION EC_XATTR_PREFIX "version" +#define EC_XATTR_HEAL EC_XATTR_PREFIX "heal" +#define EC_XATTR_HEAL_NEW EC_XATTR_PREFIX "heal-new" +#define EC_XATTR_DIRTY EC_XATTR_PREFIX "dirty" +#define EC_STRIPE_CACHE_MAX_SIZE 10 +#define EC_VERSION_SIZE 2 +#define EC_SHD_INODE_LRU_LIMIT 10 + +#define EC_MAX_FRAGMENTS EC_METHOD_MAX_FRAGMENTS +/* The maximum number of nodes is derived from the maximum allowed fragments + * using the rule that redundancy cannot be equal or greater than the number + * of fragments. 
+ */ +#define EC_MAX_NODES min(EC_MAX_FRAGMENTS * 2 - 1, EC_METHOD_MAX_NODES) + +#endif /* __EC_H__ */ diff --git a/xlators/cluster/ha/src/Makefile.am b/xlators/cluster/ha/src/Makefile.am deleted file mode 100644 index 5f78a296533..00000000000 --- a/xlators/cluster/ha/src/Makefile.am +++ /dev/null @@ -1,15 +0,0 @@ -xlator_LTLIBRARIES = ha.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/cluster - -ha_la_LDFLAGS = -module -avoidversion - -ha_la_SOURCES = ha-helpers.c ha.c -ha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -noinst_HEADERS = ha.h - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) - -CLEANFILES = - diff --git a/xlators/cluster/ha/src/ha-helpers.c b/xlators/cluster/ha/src/ha-helpers.c deleted file mode 100644 index 7c361547af8..00000000000 --- a/xlators/cluster/ha/src/ha-helpers.c +++ /dev/null @@ -1,204 +0,0 @@ -/* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#include "xlator.h" -#include "call-stub.h" -#include "defaults.h" -#include "dict.h" -#include "compat-errno.h" -#include "ha.h" - -#define HA_TRANSPORT_NOTCONN(_ret, _errno, _fd) \ - ((_ret == -1) && (_fd ? 
(_errno == EBADFD):(_errno == ENOTCONN))) - -int ha_alloc_init_fd (call_frame_t *frame, fd_t *fd) -{ - ha_local_t *local = NULL; - int i = -1; - ha_private_t *pvt = NULL; - int child_count = 0; - int ret = -1; - hafd_t *hafdp = NULL; - xlator_t *this = NULL; - uint64_t tmp_hafdp = 0; - - this = frame->this; - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - - if (local == NULL) { - ret = fd_ctx_get (fd, this, &tmp_hafdp); - if (ret < 0) { - goto out; - } - hafdp = (hafd_t *)(long)tmp_hafdp; - local = frame->local = GF_CALLOC (1, sizeof (*local), - gf_ha_mt_ha_local_t); - if (local == NULL) { - ret = -ENOMEM; - goto out; - } - local->state = GF_CALLOC (1, child_count, - gf_ha_mt_child_count); - if (local->state == NULL) { - ret = -ENOMEM; - goto out; - } - - /* take care of the preferred subvolume */ - if (pvt->pref_subvol == -1) - local->active = hafdp->active; - else - local->active = pvt->pref_subvol; - - LOCK (&hafdp->lock); - memcpy (local->state, hafdp->fdstate, child_count); - UNLOCK (&hafdp->lock); - - /* in case the preferred subvolume is down */ - if ((local->active != -1) && (local->state[local->active] == 0)) - local->active = -1; - - for (i = 0; i < child_count; i++) { - if (local->state[i]) { - if (local->active == -1) - local->active = i; - local->tries++; - } - } - if (local->active == -1) { - ret = -ENOTCONN; - goto out; - } - local->fd = fd_ref (fd); - } - ret = 0; -out: - return ret; -} - -int ha_handle_cbk (call_frame_t *frame, void *cookie, int op_ret, int op_errno) -{ - xlator_t *xl = NULL; - ha_private_t *pvt = NULL; - xlator_t **children = NULL; - int prev_child = -1; - hafd_t *hafdp = NULL; - int ret = -1; - call_stub_t *stub = NULL; - ha_local_t *local = NULL; - uint64_t tmp_hafdp = 0; - - xl = frame->this; - pvt = xl->private; - children = pvt->children; - prev_child = (long) cookie; - local = frame->local; - - if (op_ret == -1) { - gf_log (xl->name, GF_LOG_ERROR ,"(child=%s) (op_ret=%d op_errno=%s)", - 
children[prev_child]->name, op_ret, strerror (op_errno)); - } - - if (HA_TRANSPORT_NOTCONN (op_ret, op_errno, (local->fd))) { - ret = 0; - if (local->fd) { - ret = fd_ctx_get (local->fd, xl, &tmp_hafdp); - } - hafdp = (hafd_t *)(long)tmp_hafdp; - if (ret == 0) { - if (local->fd) { - LOCK(&hafdp->lock); - hafdp->fdstate[prev_child] = 0; - UNLOCK(&hafdp->lock); - } - local->tries--; - if (local->tries != 0) { - while (1) { - local->active = (local->active + 1) % pvt->child_count; - if (local->state[local->active]) - break; - } - stub = local->stub; - local->stub = NULL; - call_resume (stub); - return -1; - } - } - } - if (local->stub) { - call_stub_destroy (local->stub); - local->stub = NULL; - } - - if (local->fd) { - GF_FREE (local->state); - local->state = NULL; - - fd_unref (local->fd); - local->fd = NULL; - } - return 0; -} - -int ha_alloc_init_inode (call_frame_t *frame, inode_t *inode) -{ - int i = -1; - ha_private_t *pvt = NULL; - xlator_t *xl = NULL; - int ret = -1; - ha_local_t *local = NULL; - uint64_t tmp_state = 0; - - xl = frame->this; - pvt = xl->private; - local = frame->local; - - if (local == NULL) { - local = frame->local = GF_CALLOC (1, sizeof (*local), - gf_ha_mt_ha_local_t); - if (local == NULL) { - ret = -ENOMEM; - goto out; - } - local->active = pvt->pref_subvol; - ret = inode_ctx_get (inode, xl, &tmp_state); - if (ret < 0) { - goto out; - } - local->state = (char *)(long)tmp_state; - if (local->active != -1 && local->state[local->active] == 0) - local->active = -1; - for (i = 0; i < pvt->child_count; i++) { - if (local->state[i]) { - if (local->active == -1) - local->active = i; - local->tries++; - } - } - if (local->active == -1) { - ret = -ENOTCONN; - goto out; - } - } - ret = 0; -out: - return ret; -} diff --git a/xlators/cluster/ha/src/ha-mem-types.h b/xlators/cluster/ha/src/ha-mem-types.h deleted file mode 100644 index b460588aa03..00000000000 --- a/xlators/cluster/ha/src/ha-mem-types.h +++ /dev/null @@ -1,37 +0,0 @@ - -/* - Copyright 
(c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - - -#ifndef __HA_MEM_TYPES_H__ -#define __HA_MEM_TYPES_H__ - -#include "mem-types.h" - -enum gf_ha_mem_types_ { - gf_ha_mt_ha_local_t = gf_common_mt_end + 1, - gf_ha_mt_hafd_t, - gf_ha_mt_char, - gf_ha_mt_child_count, - gf_ha_mt_xlator_t, - gf_ha_mt_ha_private_t, - gf_ha_mt_end -}; -#endif - diff --git a/xlators/cluster/ha/src/ha.c b/xlators/cluster/ha/src/ha.c deleted file mode 100644 index 7bb1824a138..00000000000 --- a/xlators/cluster/ha/src/ha.c +++ /dev/null @@ -1,4023 +0,0 @@ -/* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - -/* generate errors randomly, code is simple now, better alogorithm - * can be written to decide what error to be returned and when - */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "call-stub.h" -#include "defaults.h" -#include "dict.h" -#include "compat-errno.h" -#include "ha.h" - -/* - * TODO: - * - dbench fails if ha over server side afr - * - lock calls - lock on all subvols. - * - support preferred-subvolume option. code already there. - * - do not alloc the call-stub in case only one subvol is up. - */ - -void -ha_local_wipe (ha_local_t *local) -{ - if (local->stub) { - call_stub_destroy (local->stub); - local->stub = NULL; - } - - if (local->state) { - GF_FREE (local->state); - local->state = NULL; - } - - if (local->dict) { - dict_unref (local->dict); - local->dict = NULL; - } - - loc_wipe (&local->loc); - - if (local->fd) { - fd_unref (local->fd); - local->fd = NULL; - } - - if (local->inode) { - inode_unref (local->inode); - local->inode = NULL; - } - - GF_FREE (local); - return; -} - - -int -ha_forget (xlator_t *this, - inode_t *inode) -{ - uint64_t stateino = 0; - char *state = NULL; - if (!inode_ctx_del (inode, this, &stateino)) { - state = ((char *)(long)stateino); - GF_FREE (state); - } - - return 0; - -} - -int32_t -ha_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - int child_count = 0, i = 0, callcnt = 0; - char *state = NULL; - call_frame_t *prev_frame = NULL; - xlator_t **children = NULL; - uint64_t tmp_state = 0; - - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - prev_frame = cookie; - children = pvt->children; - - for (i = 0; i < child_count; i++) { - if (pvt->children[i] == prev_frame->this) - break; - } - if ((op_ret == -1) && (op_errno != ENOENT)) { 
- gf_log (this->name, GF_LOG_ERROR, "(child=%s) (op_ret=%d op_errno=%s)", - children[i]->name, op_ret, strerror (op_errno)); - } - inode_ctx_get (local->inode, this, &tmp_state); - state = (char *)(long)tmp_state; - - LOCK (&frame->lock); - if (local->revalidate == 1) { - if ((!op_ret) != state[i]) { - local->revalidate_error = 1; - gf_log (this->name, GF_LOG_DEBUG, "revalidate error on %s", - pvt->children[i]->name); - } - } else { - if (op_ret == 0) { - state[i] = 1; - } - } - if (local->op_ret == -1 && op_ret == 0) { - local->op_ret = 0; - local->buf = *buf; - local->postparent = *postparent; - if (dict) - local->dict = dict_ref (dict); - } - if (op_ret == -1 && op_ret != ENOTCONN) - local->op_errno = op_errno; - callcnt = --local->call_count; - UNLOCK (&frame->lock); - - if (callcnt == 0) { - dict_t *ctx = local->dict; - inode_t *inode = local->inode; - if (local->revalidate_error == 1) { - local->op_ret = -1; - local->op_errno = EIO; - gf_log (this->name, GF_LOG_DEBUG, "revalidate error, returning EIO"); - } - STACK_UNWIND (frame, - local->op_ret, - local->op_errno, - inode, - &local->buf, - ctx, - &local->postparent); - if (inode) - inode_unref (inode); - if (ctx) - dict_unref (ctx); - } - return 0; -} - -int32_t -ha_lookup (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *xattr_req) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - int child_count = 0, i = 0; - char *state = NULL; - xlator_t **children = NULL; - int ret = -1; - int32_t op_errno = EINVAL; - - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - children = pvt->children; - - frame->local = local = GF_CALLOC (1, sizeof (*local), - gf_ha_mt_ha_local_t); - if (!local) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - op_errno = ENOMEM; - goto unwind; - } - - child_count = pvt->child_count; - local->inode = inode_ref (loc->inode); - - ret = inode_ctx_get (loc->inode, this, NULL); - if (ret) { - state = GF_CALLOC (1, child_count, 
gf_ha_mt_child_count); - if (state == NULL) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - op_errno = ENOMEM; - goto unwind; - } - - inode_ctx_put (loc->inode, this, (uint64_t)(long)state); - } else - local->revalidate = 1; - - local->op_ret = -1; - local->op_errno = ENOTCONN; - local->call_count = child_count; - - for (i = 0; i < child_count; i++) { - STACK_WIND (frame, - ha_lookup_cbk, - children[i], - children[i]->fops->lookup, - loc, - xattr_req); - } - return 0; - -unwind: - local = frame->local; - frame->local = NULL; - STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL); - - ha_local_wipe (local); - return 0; -} - - int32_t -ha_stat_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno, - buf); - } - return 0; -} - -int32_t -ha_stat (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - ha_local_t *local = NULL; - int op_errno = ENOTCONN; - - op_errno = ha_alloc_init_inode (frame, loc->inode); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_stat_stub (frame, ha_stat, loc); - - STACK_WIND_COOKIE (frame, - ha_stat_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->stat, - loc); - return 0; -err: - STACK_UNWIND (frame, -1, op_errno, NULL); - return 0; -} - -int32_t -ha_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *statpre, - struct iatt *statpost) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, op_ret, op_errno, statpre, statpost); - } - return 0; -} - - -int32_t -ha_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, - int32_t valid) -{ - ha_local_t *local = 
NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_inode (frame, loc->inode); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_setattr_stub (frame, ha_setattr, loc, stbuf, valid); - - STACK_WIND_COOKIE (frame, - ha_setattr_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->setattr, - loc, stbuf, valid); - return 0; -err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - return 0; -} - - -int32_t -ha_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, - int32_t valid) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_fd (frame, fd); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_fsetattr_stub (frame, ha_fsetattr, fd, stbuf, valid); - - STACK_WIND_COOKIE (frame, - ha_setattr_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->fsetattr, - fd, stbuf, valid); - return 0; -err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - return 0; -} - - -int32_t -ha_truncate_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno, - prebuf, - postbuf); - } - return 0; -} - -int32_t -ha_truncate (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - off_t offset) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_inode (frame, loc->inode); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_truncate_stub (frame, ha_truncate, loc, offset); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - 
ha_truncate_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->truncate, - loc, - offset); - return 0; -err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - return 0; -} - - int32_t -ha_ftruncate_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno, - prebuf, postbuf); - } - return 0; -} - -int32_t -ha_ftruncate (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - off_t offset) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_fd (frame, fd); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_ftruncate_stub (frame, ha_ftruncate, fd, offset); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_ftruncate_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->ftruncate, - fd, - offset); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - ha_local_wipe (local); - return 0; -} - -int32_t -ha_access_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno); - } - return 0; -} - -int32_t -ha_access (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t mask) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_inode (frame, loc->inode); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_access_stub (frame, ha_access, 
loc, mask); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_access_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->access, - loc, - mask); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno); - - ha_local_wipe (local); - return 0; -} - - - int32_t -ha_readlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - const char *path, - struct iatt *sbuf) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno, - path, - sbuf); - } - return 0; -} - -int32_t -ha_readlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - size_t size) -{ - ha_local_t *local = frame->local; - int op_errno = 0; - - op_errno = ha_alloc_init_inode (frame, loc->inode); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_readlink_stub (frame, ha_readlink, loc, size); - - STACK_WIND_COOKIE (frame, - ha_readlink_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->readlink, - loc, - size); - return 0; -err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - return 0; -} - -int -ha_mknod_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - char *stateino = NULL; - int child_count = 0, i = 0, cnt = 0, ret = 0; - call_frame_t *prev_frame = NULL; - xlator_t **children = NULL; - uint64_t tmp_stateino = 0; - - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - prev_frame = cookie; - children = pvt->children; - - for (i = 0; i < 
child_count; i++) - if (prev_frame->this == children[i]) - break; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "(path=%s) (op_ret=%d op_errno=%d)", - local->stub->args.mknod.loc.path, op_ret, op_errno); - } - ret = inode_ctx_get (local->stub->args.mknod.loc.inode, - this, &tmp_stateino); - stateino = (char *)(long)tmp_stateino; - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, - "unwind(-1), inode_ctx_get() error"); - /* It is difficult to handle this error at this stage - * as we still expect more cbks, we can't return as - * of now - */ - } else if (op_ret == 0) { - stateino[i] = 1; - } - LOCK (&frame->lock); - cnt = --local->call_count; - UNLOCK (&frame->lock); - - if (cnt == 0) { - call_stub_t *stub = local->stub; - GF_FREE (local->state); - STACK_UNWIND (frame, - local->op_ret, - local->op_errno, - local->stub->args.mknod.loc.inode, - &local->buf, &local->preparent, - &local->postparent); - call_stub_destroy (stub); - } - return 0; -} - -int32_t -ha_mknod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - char *stateino = NULL; - int child_count = 0, i = 0, cnt = 0, ret = 0; - call_frame_t *prev_frame = NULL; - xlator_t **children = NULL; - uint64_t tmp_stateino = 0; - - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - prev_frame = cookie; - children = pvt->children; - - for (i = 0; i < child_count; i++) - if (prev_frame->this == children[i]) - break; - - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mknod.loc.path, op_ret, op_errno); - } - - ret = inode_ctx_get (local->stub->args.mknod.loc.inode, - this, &tmp_stateino); - stateino = (char *)(long)tmp_stateino; - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, 
"inode_ctx_get() error"); - /* FIXME: handle the case */ - } - if (op_ret == 0) { - stateino[i] = 1; - local->op_ret = 0; - local->first_success = 1; - local->buf = *buf; - local->preparent = *preparent; - local->postparent = *postparent; - } - cnt = --local->call_count; - for (i = local->active + 1; i < child_count; i++) { - if (local->state[i]) - break; - } - - if (cnt == 0 || i == child_count) { - call_stub_t *stub = local->stub; - GF_FREE (local->state); - stub = local->stub; - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->stub->args.mknod.loc.inode, &local->buf, - &local->preparent, &local->postparent); - call_stub_destroy (stub); - return 0; - } - - local->active = i; - - if (local->first_success == 0) { - STACK_WIND (frame, - ha_mknod_cbk, - children[i], - children[i]->fops->mknod, - &local->stub->args.mknod.loc, - local->stub->args.mknod.mode, - local->stub->args.mknod.rdev); - return 0; - } - cnt = local->call_count; - - for (; i < child_count; i++) { - if (local->state[i]) { - STACK_WIND (frame, - ha_mknod_lookup_cbk, - children[i], - children[i]->fops->lookup, - &local->stub->args.mknod.loc, - 0); - if (--cnt == 0) - break; - } - } - return 0; -} - -int32_t -ha_mknod (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode, - dev_t rdev) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - int child_count = 0, i = 0; - char *stateino = NULL; - int32_t op_errno = EINVAL; - - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - - frame->local = local = GF_CALLOC (1, sizeof (*local), - gf_ha_mt_ha_local_t); - if (!local) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - op_errno = ENOMEM; - goto err; - } - - local->stub = fop_mknod_stub (frame, ha_mknod, loc, mode, rdev); - if (!local->stub) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - op_errno = ENOMEM; - goto err; - } - - local->op_ret = -1; - local->op_errno = ENOTCONN; - local->state = GF_CALLOC (1, child_count, 
gf_ha_mt_char); - if (!local->state) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - op_errno = ENOMEM; - goto err; - } - - memcpy (local->state, pvt->state, child_count); - local->active = -1; - - stateino = GF_CALLOC (1, child_count, gf_ha_mt_char); - if (!stateino) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - op_errno = ENOMEM; - goto err; - } - - inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino); - - for (i = 0; i < child_count; i++) { - if (local->state[i]) { - local->call_count++; - if (local->active == -1) - local->active = i; - } - } - - STACK_WIND (frame, - ha_mknod_cbk, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->mknod, - loc, mode, rdev); - return 0; - -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL); - ha_local_wipe (local); - return 0; -} - - -int -ha_mkdir_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - char *stateino = NULL; - int child_count = 0, i = 0, cnt = 0; - call_frame_t *prev_frame = NULL; - xlator_t **children = NULL; - uint64_t tmp_stateino = 0; - - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - prev_frame = cookie; - children = pvt->children; - - for (i = 0; i < child_count; i++) - if (prev_frame->this == children[i]) - break; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mkdir.loc.path, op_ret, op_errno); - } - inode_ctx_get (local->stub->args.mkdir.loc.inode, - this, &tmp_stateino); - stateino = (char *)(long)tmp_stateino; - - if (op_ret == 0) - stateino[i] = 1; - - LOCK (&frame->lock); - cnt = --local->call_count; - UNLOCK (&frame->lock); - - if (cnt == 0) { - call_stub_t *stub = local->stub; - GF_FREE (local->state); 
- STACK_UNWIND (frame, - local->op_ret, - local->op_errno, - local->stub->args.mkdir.loc.inode, &local->buf, - &local->preparent, &local->postparent); - call_stub_destroy (stub); - } - return 0; -} - -int32_t -ha_mkdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - char *stateino = NULL; - int child_count = 0, i = 0, cnt = 0; - call_frame_t *prev_frame = NULL; - xlator_t **children = NULL; - uint64_t tmp_stateino = 0; - - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - prev_frame = cookie; - children = pvt->children; - - for (i = 0; i < child_count; i++) - if (prev_frame->this == children[i]) - break; - - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mkdir.loc.path, op_ret, op_errno); - } - - inode_ctx_get (local->stub->args.mkdir.loc.inode, - this, &tmp_stateino); - stateino = (char *)(long)tmp_stateino; - - if (op_ret == 0) { - stateino[i] = 1; - local->op_ret = 0; - local->first_success = 1; - local->buf = *buf; - local->preparent = *preparent; - local->postparent = *postparent; - } - cnt = --local->call_count; - for (i = local->active + 1; i < child_count; i++) { - if (local->state[i]) - break; - } - - if (cnt == 0 || i == child_count) { - call_stub_t *stub = local->stub; - GF_FREE (local->state); - stub = local->stub; - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->stub->args.mkdir.loc.inode, &local->buf, - &local->preparent, &local->postparent); - call_stub_destroy (stub); - return 0; - } - - local->active = i; - - if (local->first_success == 0) { - STACK_WIND (frame, - ha_mkdir_cbk, - children[i], - children[i]->fops->mkdir, - &local->stub->args.mkdir.loc, - local->stub->args.mkdir.mode); - return 0; - } - cnt = 
local->call_count; - - for (; i < child_count; i++) { - if (local->state[i]) { - STACK_WIND (frame, - ha_mkdir_lookup_cbk, - children[i], - children[i]->fops->lookup, - &local->stub->args.mkdir.loc, - 0); - if (--cnt == 0) - break; - } - } - return 0; -} - -int32_t -ha_mkdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - int child_count = 0, i = 0; - char *stateino = NULL; - int32_t op_errno = EINVAL; - - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - - frame->local = local = GF_CALLOC (1, sizeof (*local), - gf_ha_mt_ha_local_t); - if (!frame->local) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - op_errno = ENOMEM; - goto err; - } - - local->stub = fop_mkdir_stub (frame, ha_mkdir, loc, mode); - if (!local->stub) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - op_errno = ENOMEM; - goto err; - } - - local->op_ret = -1; - local->op_errno = ENOTCONN; - local->state = GF_CALLOC (1, child_count, gf_ha_mt_char); - if (!local->state) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - op_errno = ENOMEM; - goto err; - } - - memcpy (local->state, pvt->state, child_count); - local->active = -1; - - stateino = GF_CALLOC (1, child_count, gf_ha_mt_char); - if (!stateino) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - op_errno = ENOMEM; - goto err; - } - - inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino); - for (i = 0; i < child_count; i++) { - if (local->state[i]) { - local->call_count++; - if (local->active == -1) - local->active = i; - } - } - - STACK_WIND (frame, - ha_mkdir_cbk, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->mkdir, - loc, mode); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL); - ha_local_wipe (local); - return 0; -} - - int32_t -ha_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t 
*this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - if (ret == 0) { - STACK_UNWIND (frame, op_ret, op_errno, preparent, postparent); - } - return 0; -} - -int32_t -ha_unlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_inode (frame, loc->inode); - - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_unlink_stub (frame, ha_unlink, loc); - if (!local->stub) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - op_errno = ENOMEM; - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_unlink_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->unlink, - loc); - return 0; -err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - return 0; -} - - int32_t -ha_rmdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno, - preparent, - postparent); - } - return 0; -} - -int32_t -ha_rmdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - ha_local_t *local = frame->local; - int op_errno = 0; - - op_errno = ha_alloc_init_inode (frame, loc->inode); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_rmdir_stub (frame, ha_rmdir, loc); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_rmdir_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->rmdir, - loc); - return 0; -err: - STACK_UNWIND (frame, -1, op_errno, NULL, 
NULL); - return 0; -} - - -int -ha_symlink_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - char *stateino = NULL; - int child_count = 0, i = 0, cnt = 0; - call_frame_t *prev_frame = NULL; - xlator_t **children = NULL; - uint64_t tmp_stateino = 0; - - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - prev_frame = cookie; - children = pvt->children; - - for (i = 0; i < child_count; i++) - if (prev_frame->this == children[i]) - break; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.symlink.loc.path, op_ret, op_errno); - } - inode_ctx_get (local->stub->args.symlink.loc.inode, - this, &tmp_stateino); - stateino = (char *)(long)tmp_stateino; - - if (op_ret == 0) - stateino[i] = 1; - - LOCK (&frame->lock); - cnt = --local->call_count; - UNLOCK (&frame->lock); - - if (cnt == 0) { - call_stub_t *stub = local->stub; - GF_FREE (local->state); - STACK_UNWIND (frame, - local->op_ret, - local->op_errno, - local->stub->args.symlink.loc.inode, &local->buf, - &local->preparent, &local->postparent); - call_stub_destroy (stub); - } - return 0; -} - -int32_t -ha_symlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - char *stateino = NULL; - int child_count = 0, i = 0, cnt = 0; - call_frame_t *prev_frame = NULL; - xlator_t **children = NULL; - uint64_t tmp_stateino = 0; - - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - prev_frame = cookie; - children = pvt->children; - - for (i = 0; i < child_count; i++) - if (prev_frame->this == children[i]) - break; - - if 
(op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.symlink.loc.path, op_ret, op_errno); - } - inode_ctx_get (local->stub->args.symlink.loc.inode, - this, &tmp_stateino); - stateino = (char *)(long)tmp_stateino; - - if (op_ret == 0) { - stateino[i] = 1; - local->op_ret = 0; - local->first_success = 1; - local->buf = *buf; - local->preparent = *preparent; - local->postparent = *postparent; - } - cnt = --local->call_count; - for (i = local->active + 1; i < child_count; i++) { - if (local->state[i]) - break; - } - - if (cnt == 0 || i == child_count) { - call_stub_t *stub = local->stub; - GF_FREE (local->state); - stub = local->stub; - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->stub->args.symlink.loc.inode, &local->buf, - &local->preparent, &local->postparent); - call_stub_destroy (stub); - return 0; - } - - local->active = i; - - if (local->first_success == 0) { - STACK_WIND (frame, - ha_symlink_cbk, - children[i], - children[i]->fops->symlink, - local->stub->args.symlink.linkname, - &local->stub->args.symlink.loc); - return 0; - } - cnt = local->call_count; - - for (; i < child_count; i++) { - if (local->state[i]) { - STACK_WIND (frame, - ha_symlink_lookup_cbk, - children[i], - children[i]->fops->lookup, - &local->stub->args.symlink.loc, - 0); - if (--cnt == 0) - break; - } - } - return 0; -} - -int32_t -ha_symlink (call_frame_t *frame, - xlator_t *this, - const char *linkname, - loc_t *loc) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - int child_count = 0, i = 0; - char *stateino = NULL; - int32_t op_errno = EINVAL; - - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - - frame->local = local = GF_CALLOC (1, sizeof (*local), - gf_ha_mt_ha_local_t); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - local->stub = fop_symlink_stub (frame, ha_symlink, linkname, 
loc); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - local->op_ret = -1; - local->op_errno = ENOTCONN; - local->state = GF_CALLOC (1, child_count, gf_ha_mt_char); - if (!local->state) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - memcpy (local->state, pvt->state, child_count); - local->active = -1; - - stateino = GF_CALLOC (1, child_count, gf_ha_mt_char); - if (!stateino) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino); - - for (i = 0; i < child_count; i++) { - if (local->state[i]) { - local->call_count++; - if (local->active == -1) { - local->active = i; - } - } - } - - STACK_WIND (frame, - ha_symlink_cbk, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->symlink, - linkname, loc); - return 0; -err: - local = frame->local; - frame->local = NULL; - STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL); - ha_local_wipe (local); - return 0; -} - - int32_t -ha_rename_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf, - struct iatt *preoldparent, - struct iatt *postoldparent, - struct iatt *prenewparent, - struct iatt *postnewparent) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, op_ret, op_errno, buf, preoldparent, - postoldparent, prenewparent, postnewparent); - } - return 0; -} - -int32_t -ha_rename (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_inode (frame, oldloc->inode); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_rename_stub (frame, ha_rename, oldloc, newloc); - if (!local->stub) { - gf_log (this->name, 
GF_LOG_ERROR, "out of memory"); - op_errno = ENOMEM; - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_rename_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->rename, - oldloc, newloc); - return 0; -err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - return 0; -} - -int -ha_link_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - char *stateino = NULL; - int child_count = 0, i = 0, cnt = 0; - call_frame_t *prev_frame = NULL; - xlator_t **children = NULL; - uint64_t tmp_stateino = 0; - - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - prev_frame = cookie; - children = pvt->children; - - for (i = 0; i < child_count; i++) - if (prev_frame->this == children[i]) - break; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.link.newloc.path, op_ret, op_errno); - } - inode_ctx_get (local->stub->args.link.newloc.inode, - this, &tmp_stateino); - stateino = (char *)(long)tmp_stateino; - - if (op_ret == 0) - stateino[i] = 1; - - LOCK (&frame->lock); - cnt = --local->call_count; - UNLOCK (&frame->lock); - - if (cnt == 0) { - call_stub_t *stub = local->stub; - GF_FREE (local->state); - STACK_UNWIND (frame, - local->op_ret, - local->op_errno, - local->stub->args.link.oldloc.inode, &local->buf, - &local->preparent, &local->postparent); - call_stub_destroy (stub); - } - return 0; -} - -int32_t -ha_link_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - char *stateino = NULL; - int child_count = 0, i = 0, cnt = 0; - 
call_frame_t *prev_frame = NULL; - xlator_t **children = NULL; - uint64_t tmp_stateino = 0; - - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - prev_frame = cookie; - children = pvt->children; - - for (i = 0; i < child_count; i++) - if (prev_frame->this == children[i]) - break; - - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.link.newloc.path, op_ret, op_errno); - } - inode_ctx_get (local->stub->args.link.newloc.inode, - this, &tmp_stateino); - stateino = (char *)(long)tmp_stateino; - - if (op_ret == 0) { - stateino[i] = 1; - local->op_ret = 0; - local->first_success = 1; - local->buf = *buf; - local->preparent = *preparent; - local->postparent = *postparent; - } - cnt = --local->call_count; - for (i = local->active + 1; i < child_count; i++) { - if (local->state[i]) - break; - } - - if (cnt == 0 || i == child_count) { - call_stub_t *stub = local->stub; - GF_FREE (local->state); - stub = local->stub; - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local->stub->args.link.oldloc.inode, &local->buf, - &local->preparent, &local->postparent); - call_stub_destroy (stub); - return 0; - } - - local->active = i; - - if (local->first_success == 0) { - STACK_WIND (frame, - ha_link_cbk, - children[i], - children[i]->fops->link, - &local->stub->args.link.oldloc, - &local->stub->args.link.newloc); - return 0; - } - cnt = local->call_count; - - for (; i < child_count; i++) { - if (local->state[i]) { - STACK_WIND (frame, - ha_link_lookup_cbk, - children[i], - children[i]->fops->lookup, - &local->stub->args.link.newloc, - 0); - if (--cnt == 0) - break; - } - } - return 0; -} - -int32_t -ha_link (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - int child_count = 0, i = 0; - char *stateino = NULL; - int32_t ret = 0, op_errno = 0; - uint64_t tmp_stateino = 0; - - 
ret = inode_ctx_get (newloc->inode, this, &tmp_stateino); - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()"); - } - stateino = (char *)(long)tmp_stateino; - - if (stateino == NULL) { - gf_log (this->name, GF_LOG_ERROR, - "newloc->inode's ctx is NULL, returning EINVAL"); - STACK_UNWIND (frame, -1, EINVAL, oldloc->inode, NULL, NULL, - NULL); - return 0; - } - - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - - frame->local = local = GF_CALLOC (1, sizeof (*local), - gf_ha_mt_ha_local_t); - if (!frame->local) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - op_errno = ENOMEM; - goto err; - } - - local->stub = fop_link_stub (frame, ha_link, oldloc, newloc); - if (!local->stub) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - op_errno = ENOMEM; - goto err; - } - - local->op_ret = -1; - local->op_errno = ENOTCONN; - local->state = GF_CALLOC (1, child_count, gf_ha_mt_char); - if (!local->state) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - op_errno = ENOMEM; - goto err; - } - - memcpy (local->state, pvt->state, child_count); - local->active = -1; - - for (i = 0; i < child_count; i++) { - if (local->state[i]) { - local->call_count++; - if (local->active == -1) - local->active = i; - } - } - - STACK_WIND (frame, - ha_link_cbk, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->link, - oldloc, - newloc); - return 0; -err: - local = frame->local; - frame->local = NULL; - STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL); - return 0; -} - -int32_t -ha_create_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - int i, child_count = 0, cnt = 0, ret = 0; - char *stateino = NULL; - hafd_t *hafdp = NULL; - call_frame_t *prev_frame = NULL; - xlator_t **children = 
NULL; - uint64_t tmp_stateino = 0; - uint64_t tmp_hafdp = 0; - - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - prev_frame = cookie; - children = pvt->children; - - ret = inode_ctx_get (local->stub->args.create.loc.inode, - this, &tmp_stateino); - stateino = (char *)(long)tmp_stateino; - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "dict_to_ptr() error"); - /* FIXME: handle */ - } - ret = fd_ctx_get (local->stub->args.create.fd, this, &tmp_hafdp); - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "dict_to_ptr() error"); - /* FIXME: handle */ - } - hafdp = (hafd_t *)(long)tmp_hafdp; - - for (i = 0; i < child_count; i++) { - if (prev_frame->this == children[i]) - break; - } - - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.create.loc.path, op_ret, op_errno); - } - if (op_ret != -1) { - stateino[i] = 1; - hafdp->fdstate[i] = 1; - if (local->op_ret == -1) { - local->op_ret = 0; - local->buf = *buf; - local->preparent = *preparent; - local->postparent = *postparent; - local->first_success = 1; - } - local->stub->args.create.flags &= (~O_EXCL); - } - LOCK (&frame->lock); - cnt = --local->call_count; - UNLOCK (&frame->lock); - - for (i = local->active + 1; i < child_count; i++) { - if (local->state[i]) - break; - } - - if (cnt == 0 || i == child_count) { - char *state = local->state; - call_stub_t *stub = local->stub; - STACK_UNWIND (frame, local->op_ret, local->op_errno, - stub->args.create.fd, - stub->args.create.loc.inode, &local->buf, - &local->preparent, &local->postparent); - GF_FREE (state); - call_stub_destroy (stub); - return 0; - } - local->active = i; - cnt = local->call_count; - for (; i < child_count; i++) { - if (local->state[i]) { - STACK_WIND (frame, - ha_create_cbk, - children[i], - children[i]->fops->create, - &local->stub->args.create.loc, - local->stub->args.create.flags, - local->stub->args.create.mode, - 
local->stub->args.create.fd); - if ((local->first_success == 0) || (cnt == 0)) - break; - } - } - return 0; -} - -int32_t -ha_create (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - mode_t mode, fd_t *fd) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - int i, child_count = 0; - char *stateino = NULL; - xlator_t **children = NULL; - hafd_t *hafdp = NULL; - int32_t op_errno = EINVAL; - - local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - children = pvt->children; - - if (local == NULL) { - frame->local = local = GF_CALLOC (1, sizeof (*local), - gf_ha_mt_ha_local_t); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - local->stub = fop_create_stub (frame, ha_create, loc, flags, mode, fd); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - local->state = GF_CALLOC (1, child_count, gf_ha_mt_char); - if (!local->state) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - local->active = -1; - local->op_ret = -1; - local->op_errno = ENOTCONN; - memcpy (local->state, pvt->state, child_count); - - for (i = 0; i < pvt->child_count; i++) { - if (local->state[i]) { - local->call_count++; - if (local->active == -1) - local->active = i; - } - } - /* FIXME handle active -1 */ - stateino = GF_CALLOC (1, child_count, gf_ha_mt_char); - if (!stateino) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - hafdp = GF_CALLOC (1, sizeof (*hafdp), gf_ha_mt_hafd_t); - if (!hafdp) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - hafdp->fdstate = GF_CALLOC (1, child_count, gf_ha_mt_char); - if (!hafdp->fdstate) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - hafdp->path = gf_strdup(loc->path); - if (!hafdp->path) { - 
op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - LOCK_INIT (&hafdp->lock); - fd_ctx_set (fd, this, (uint64_t)(long)hafdp); - inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino); - } - - STACK_WIND (frame, - ha_create_cbk, - children[local->active], - children[local->active]->fops->create, - loc, flags, mode, fd); - return 0; -err: - local = frame->local; - frame->local = NULL; - STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - ha_local_wipe (local); - - if (stateino) { - GF_FREE (stateino); - stateino = NULL; - } - - if (hafdp) { - if (hafdp->fdstate) { - GF_FREE (hafdp->fdstate); - } - - if (hafdp->path) { - GF_FREE (hafdp->path); - } - - GF_FREE (hafdp); - } - - return 0; -} - - int32_t -ha_open_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - xlator_t **children = NULL; - int i = 0, child_count = 0, callcnt = 0, ret = 0; - call_frame_t *prev_frame = NULL; - hafd_t *hafdp = NULL; - uint64_t tmp_hafdp = 0; - - local = frame->local; - pvt = this->private; - children = pvt->children; - child_count = pvt->child_count; - prev_frame = cookie; - - ret = fd_ctx_get (local->fd, this, &tmp_hafdp); - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()"); - } - hafdp = (hafd_t *)(long)tmp_hafdp; - - for (i = 0; i < child_count; i++) - if (children[i] == prev_frame->this) - break; - LOCK (&frame->lock); - if (op_ret != -1) { - hafdp->fdstate[i] = 1; - local->op_ret = 0; - } - if (op_ret == -1 && op_errno != ENOTCONN) - local->op_errno = op_errno; - callcnt = --local->call_count; - UNLOCK (&frame->lock); - - if (callcnt == 0) { - STACK_UNWIND (frame, - local->op_ret, - local->op_errno, - local->fd); - } - return 0; -} - -int32_t -ha_open (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, fd_t *fd, int wbflags) -{ - ha_local_t *local = NULL; - 
ha_private_t *pvt = NULL; - char *stateino = NULL; - xlator_t **children = NULL; - int cnt = 0, i, child_count = 0, ret = 0; - hafd_t *hafdp = NULL; - uint64_t tmp_stateino = 0; - int32_t op_errno = ENOMEM; - - local = frame->local; - pvt = this->private; - children = pvt->children; - child_count = pvt->child_count; - - - frame->local = local = GF_CALLOC (1, sizeof (*local), - gf_ha_mt_ha_local_t); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - local->op_ret = -1; - local->op_errno = ENOTCONN; - local->fd = fd; - - hafdp = GF_CALLOC (1, sizeof (*hafdp), gf_ha_mt_hafd_t); - if (!hafdp) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - hafdp->fdstate = GF_CALLOC (1, child_count, gf_ha_mt_char); - if (!hafdp->fdstate) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - hafdp->path = gf_strdup (loc->path); - if (!hafdp->path) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - hafdp->active = -1; - if (pvt->pref_subvol == -1) { - hafdp->active = fd->inode->ino % child_count; - } - - LOCK_INIT (&hafdp->lock); - fd_ctx_set (fd, this, (uint64_t)(long)hafdp); - ret = inode_ctx_get (loc->inode, this, &tmp_stateino); - stateino = (char *)(long)tmp_stateino; - - for (i = 0; i < child_count; i++) - if (stateino[i]) - cnt++; - local->call_count = cnt; - for (i = 0; i < child_count; i++) { - if (stateino[i]) { - STACK_WIND (frame, - ha_open_cbk, - children[i], - children[i]->fops->open, - loc, flags, fd, wbflags); - if (--cnt == 0) - break; - } - } - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno, fd); - if (hafdp) { - if (hafdp->fdstate) { - GF_FREE (hafdp->fdstate); - hafdp->fdstate = NULL; - } - - if (hafdp->path) { - GF_FREE (hafdp->path); - hafdp->path = NULL; - } - - GF_FREE (hafdp); - } - - ha_local_wipe (local); - return 
0; -} - - int32_t -ha_readv_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iovec *vector, - int32_t count, - struct iatt *stbuf, - struct iobref *iobref) -{ - int ret = 0; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno, - vector, - count, - stbuf, - iobref); - } - return 0; -} - -int32_t -ha_readv (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_fd (frame, fd); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_readv_stub (frame, ha_readv, fd, size, offset); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_readv_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->readv, - fd, - size, - offset); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL, NULL); - - ha_local_wipe (local); - return 0; -} - - int32_t -ha_writev_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - int ret = 0; - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno, - prebuf, - postbuf); - } - return 0; -} - -int32_t -ha_writev (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iovec *vector, - int32_t count, - off_t off, - struct iobref *iobref) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_fd (frame, fd); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_writev_stub (frame, ha_writev, fd, vector, count, off, - iobref); - 
if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_writev_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->writev, - fd, - vector, - count, - off, - iobref); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - ha_local_wipe (local); - return 0; -} - - int32_t -ha_flush_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int ret = 0; - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno); - } - return 0; -} - -int32_t -ha_flush (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_fd (frame, fd); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_flush_stub (frame, ha_flush, fd); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_flush_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->flush, - fd); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno); - - ha_local_wipe (local); - return 0; -} - - - int32_t -ha_fsync_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - int ret = 0; - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno); - } - return 0; -} - -int32_t -ha_fsync (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_fd (frame, fd); - if 
(op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_fsync_stub (frame, ha_fsync, fd, flags); - if (!local->stub) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - op_errno = ENOMEM; - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_fsync_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->fsync, - fd, - flags); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno); - - ha_local_wipe (local); - return 0; -} - - int32_t -ha_fstat_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf) -{ - int ret = 0; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno, - buf); - } - return 0; -} - -int32_t -ha_fstat (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_fd (frame, fd); - - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_fstat_stub (frame, ha_fstat, fd); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_fstat_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->fstat, - fd); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno, NULL); - - ha_local_wipe (local); - return 0; -} - -int32_t -ha_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - xlator_t **children = NULL; - int i = 0, child_count = 0, callcnt = 0, ret = 0; - call_frame_t *prev_frame = NULL; - hafd_t *hafdp = NULL; - uint64_t tmp_hafdp = 0; - - local = 
frame->local; - pvt = this->private; - children = pvt->children; - child_count = pvt->child_count; - prev_frame = cookie; - - ret = fd_ctx_get (local->fd, this, &tmp_hafdp); - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()"); - } - hafdp = (hafd_t *)(long)tmp_hafdp; - - for (i = 0; i < child_count; i++) - if (children[i] == prev_frame->this) - break; - LOCK (&frame->lock); - if (op_ret != -1) { - hafdp->fdstate[i] = 1; - local->op_ret = 0; - } - if (op_ret == -1 && op_errno != ENOTCONN) - local->op_errno = op_errno; - callcnt = --local->call_count; - UNLOCK (&frame->lock); - - if (callcnt == 0) { - STACK_UNWIND (frame, - local->op_ret, - local->op_errno, - local->fd); - } - return 0; -} - -int32_t -ha_opendir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, fd_t *fd) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - char *stateino = NULL; - xlator_t **children = NULL; - int cnt = 0, i, child_count = 0, ret = 0; - hafd_t *hafdp = NULL; - uint64_t tmp_stateino = 0; - int32_t op_errno = EINVAL; - - local = frame->local; - pvt = this->private; - children = pvt->children; - child_count = pvt->child_count; - - frame->local = local = GF_CALLOC (1, sizeof (*local), - gf_ha_mt_ha_local_t); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - local->op_ret = -1; - local->op_errno = ENOTCONN; - local->fd = fd; - - hafdp = GF_CALLOC (1, sizeof (*hafdp), gf_ha_mt_hafd_t); - if (!hafdp) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - hafdp->fdstate = GF_CALLOC (1, child_count, gf_ha_mt_char); - if (!hafdp->fdstate) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - hafdp->path = gf_strdup (loc->path); - if (!hafdp->path) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - LOCK_INIT (&hafdp->lock); - fd_ctx_set (fd, this, (uint64_t)(long)hafdp); - 
ret = inode_ctx_get (loc->inode, this, &tmp_stateino); - stateino = (char *)(long)tmp_stateino; - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "inode_ctx_get() error"); - } - for (i = 0; i < child_count; i++) - if (stateino[i]) - cnt++; - local->call_count = cnt; - for (i = 0; i < child_count; i++) { - if (stateino[i]) { - STACK_WIND (frame, - ha_opendir_cbk, - children[i], - children[i]->fops->opendir, - loc, fd); - if (--cnt == 0) - break; - } - } - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno, NULL); - ha_local_wipe (local); - if (hafdp) { - if (hafdp->fdstate) { - GF_FREE (hafdp->fdstate); - hafdp->fdstate = NULL; - } - - if (hafdp->path) { - GF_FREE (hafdp->path); - hafdp->path = NULL; - } - - GF_FREE (hafdp); - } - return 0; -} - - int32_t -ha_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entries, - int32_t count) -{ - int ret = 0; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno, - entries, - count); - } - return 0; -} - -int32_t -ha_getdents (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset, - int32_t flag) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_fd (frame, fd); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_getdents_stub (frame, ha_getdents, fd, size, offset, - flag); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_getdents_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->getdents, - fd, - size, - offset, - flag); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno, NULL, 0); - - ha_local_wipe (local); - return 0; 
-} - - int32_t -ha_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int ret = 0; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno); - } - return 0; -} - -int32_t -ha_setdents (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags, - dir_entry_t *entries, - int32_t count) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_fd (frame, fd); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - - local->stub = fop_setdents_stub (frame, ha_setdents, fd, flags, entries, - count); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_setdents_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->setdents, - fd, - flags, - entries, - count); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno); - - ha_local_wipe (local); - return 0; -} - - int32_t -ha_fsyncdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int ret = 0; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno); - } - return 0; -} - -int32_t -ha_fsyncdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_fd (frame, fd); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_fsyncdir_stub (frame, ha_fsyncdir, fd, flags); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_fsyncdir_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - 
HA_ACTIVE_CHILD(this, local)->fops->fsyncdir, - fd, - flags); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - ha_local_wipe (local); - return 0; -} - - - int32_t -ha_statfs_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct statvfs *buf) -{ - ha_local_t *local = NULL; - ha_private_t *priv = NULL; - - local = frame->local; - if (-1 == op_ret) { - local->active = (local->active + 1) % priv->child_count; - local->tries--; - if (!local->tries) - goto out; - - STACK_WIND (frame, ha_statfs_cbk, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->statfs, - &local->loc); - return 0; - } - - out: - loc_wipe (&local->loc); - STACK_UNWIND (frame, op_ret, op_errno, buf); - - return 0; -} - -int32_t -ha_statfs (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - ha_private_t *priv = NULL; - ha_local_t *local = NULL; - int op_errno = 0; - - /* The normal way of handling failover doesn't work here - * as loc->inode may be null in this case. 
- */ - local = GF_CALLOC (1, sizeof (*local), - gf_ha_mt_ha_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - priv = this->private; - frame->local = local; - local->active = priv->pref_subvol; - if (-1 == local->active) - local->active = 0; - local->tries = priv->child_count; - loc_copy (&local->loc, loc); - - STACK_WIND (frame, ha_statfs_cbk, HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->statfs, loc); - return 0; -err: - STACK_UNWIND (frame, -1, op_errno, NULL); - return 0; -} - - int32_t -ha_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno); - } - return 0; -} - -int32_t -ha_setxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *dict, - int32_t flags) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_inode (frame, loc->inode); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_setxattr_stub (frame, ha_setxattr, loc, dict, flags); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_setxattr_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->setxattr, - loc, - dict, - flags); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno); - - ha_local_wipe (local); - return 0; -} - - int32_t -ha_getxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *dict) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno, - dict); - } - return 0; -} - -int32_t -ha_getxattr (call_frame_t *frame, - xlator_t 
*this, - loc_t *loc, - const char *name) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_inode (frame, loc->inode); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_getxattr_stub (frame, ha_getxattr, loc, name); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_getxattr_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->getxattr, - loc, - name); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno, NULL); - - ha_local_wipe (local); - return 0; -} - -int32_t -ha_xattrop_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *dict) -{ - int ret = -1; - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - if (ret == 0) { - STACK_UNWIND (frame, op_ret, op_errno, dict); - } - return 0; -} - - -int32_t -ha_xattrop (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - gf_xattrop_flags_t flags, - dict_t *dict) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_inode (frame, loc->inode); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - - local->stub = fop_xattrop_stub (frame, ha_xattrop, loc, flags, dict); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_xattrop_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->xattrop, - loc, - flags, - dict); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno, dict); - - ha_local_wipe (local); - return 0; -} - -int32_t -ha_fxattrop_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t 
op_errno, - dict_t *dict) -{ - int ret = -1; - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - if (ret == 0) - STACK_UNWIND (frame, op_ret, op_errno, dict); - return 0; -} - -int32_t -ha_fxattrop (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - gf_xattrop_flags_t flags, - dict_t *dict) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_fd (frame, fd); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_fxattrop_stub (frame, ha_fxattrop, fd, flags, dict); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_fxattrop_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->fxattrop, - fd, - flags, - dict); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno, dict); - - ha_local_wipe (local); - return 0; -} - - int32_t -ha_removexattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int ret = -1; - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno); - } - return 0; -} - -int32_t -ha_removexattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_inode (frame, loc->inode); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - - local->stub = fop_removexattr_stub (frame, ha_removexattr, loc, name); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_removexattr_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->removexattr, - loc, - name); - return 0; -err: - local = frame->local; - 
frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno); - - ha_local_wipe (local); - return 0; -} - -int32_t -ha_lk_setlk_unlck_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct gf_flock *lock) -{ - ha_local_t *local = NULL; - int cnt = 0; - call_stub_t *stub = NULL; - - local = frame->local; - - LOCK (&frame->lock); - cnt = --local->call_count; - if (op_ret == 0) - local->op_ret = 0; - UNLOCK (&frame->lock); - - if (cnt == 0) { - stub = local->stub; - GF_FREE (local->state); - if (stub->args.lk.lock.l_type == F_UNLCK) { - STACK_UNWIND (frame, local->op_ret, local->op_errno, &stub->args.lk.lock); - } else { - STACK_UNWIND (frame, -1, EIO, NULL); - } - call_stub_destroy (stub); - } - return 0; -} - -int32_t -ha_lk_setlk_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct gf_flock *lock) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - xlator_t **children = NULL; - int i = 0, cnt = 0, j = 0; - int child_count = 0; - call_frame_t *prev_frame = NULL; - char *state = NULL; - - local = frame->local; - pvt = this->private; - children = pvt->children; - child_count = pvt->child_count; - prev_frame = cookie; - state = local->state; - - if (op_ret == 0) - local->op_ret = 0; - - if ((op_ret == 0) || (op_ret == -1 && op_errno == ENOTCONN)) { - for (i = 0; i < child_count; i++) { - if (prev_frame->this == cookie) - break; - } - i++; - for (; i < child_count; i++) { - if (local->state[i]) - break; - } - if (i == child_count) { - call_stub_t *stub = local->stub; - GF_FREE (local->state); - STACK_UNWIND (frame, 0, op_errno, &stub->args.lk.lock); - call_stub_destroy (stub); - return 0; - } - STACK_WIND (frame, - ha_lk_setlk_cbk, - children[i], - children[i]->fops->lk, - local->stub->args.lk.fd, - local->stub->args.lk.cmd, - &local->stub->args.lk.lock); - return 0; - } else { - for (i = 0; i < child_count; i++) { - if (prev_frame->this == cookie) - break; 
- } - cnt = 0; - for (j = 0; j < i; j++) { - if (state[i]) - cnt++; - } - if (cnt) { - struct gf_flock lock; - lock = local->stub->args.lk.lock; - for (i = 0; i < child_count; i++) { - if (state[i]) { - STACK_WIND (frame, - ha_lk_setlk_unlck_cbk, - children[i], - children[i]->fops->lk, - local->stub->args.lk.fd, - local->stub->args.lk.cmd, - &lock); - if (--cnt == 0) - break; - } - } - return 0; - } else { - GF_FREE (local->state); - call_stub_destroy (local->stub); - STACK_UNWIND (frame, - op_ret, - op_errno, - lock); - return 0; - } - } -} - -int32_t -ha_lk_getlk_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct gf_flock *lock) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - fd_t *fd = NULL; - int child_count = 0, i = 0; - xlator_t **children = NULL; - call_frame_t *prev_frame = NULL; - - local = frame->local; - pvt = this->private; - fd = local->stub->args.lk.fd; - child_count = pvt->child_count; - children = pvt->children; - prev_frame = cookie; - - if (op_ret == 0) { - GF_FREE (local->state); - call_stub_destroy (local->stub); - STACK_UNWIND (frame, 0, 0, lock); - return 0; - } - - for (i = 0; i < child_count; i++) { - if (prev_frame->this == children[i]) - break; - } - - for (; i < child_count; i++) { - if (local->state[i]) - break; - } - - if (i == child_count) { - GF_FREE (local->state); - call_stub_destroy (local->stub); - STACK_UNWIND (frame, op_ret, op_errno, lock); - return 0; - } - - STACK_WIND (frame, - ha_lk_getlk_cbk, - children[i], - children[i]->fops->lk, - fd, - local->stub->args.lk.cmd, - &local->stub->args.lk.lock); - return 0; -} - -int32_t -ha_lk (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t cmd, - struct gf_flock *lock) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - hafd_t *hafdp = NULL; - char *state = NULL; - int child_count = 0, i = 0, cnt = 0, ret = 0; - xlator_t **children = NULL; - uint64_t tmp_hafdp = 0; - int32_t op_errno = EINVAL; - - 
local = frame->local; - pvt = this->private; - child_count = pvt->child_count; - children = pvt->children; - ret = fd_ctx_get (fd, this, &tmp_hafdp); - if (ret < 0) - gf_log (this->name, GF_LOG_ERROR, "fd_ctx_get failed"); - - if (local == NULL) { - local = frame->local = GF_CALLOC (1, sizeof (*local), - gf_ha_mt_ha_local_t); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - local->active = -1; - local->op_ret = -1; - local->op_errno = ENOTCONN; - } - hafdp = (hafd_t *)(long)tmp_hafdp; - - if (local->active == -1) { - op_errno = ENOTCONN; - goto err; - } - - local->stub = fop_lk_stub (frame, ha_lk, fd, cmd, lock); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - local->state = GF_CALLOC (1, child_count, gf_ha_mt_char); - if (!local->state) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - state = hafdp->fdstate; - LOCK (&hafdp->lock); - memcpy (local->state, state, child_count); - UNLOCK (&hafdp->lock); - if (cmd == F_GETLK) { - for (i = 0; i < child_count; i++) { - if (local->state[i]) - break; - } - STACK_WIND (frame, - ha_lk_getlk_cbk, - children[i], - children[i]->fops->lk, - fd, - cmd, - lock); - } else if (cmd == F_SETLK && lock->l_type == F_UNLCK) { - for (i = 0; i < child_count; i++) { - if (local->state[i]) - local->call_count++; - } - cnt = local->call_count; - for (i = 0; i < child_count; i++) { - if (local->state[i]) { - STACK_WIND (frame, - ha_lk_setlk_unlck_cbk, - children[i], - children[i]->fops->lk, - fd, cmd, lock); - if (--cnt == 0) - break; - } - } - } else { - for (i = 0; i < child_count; i++) { - if (local->state[i]) - break; - } - STACK_WIND (frame, - ha_lk_setlk_cbk, - children[i], - children[i]->fops->lk, - fd, - cmd, - lock); - } - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno, NULL); - - ha_local_wipe (local); - 
return 0; -} - - int32_t -ha_inode_entry_lk_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - if (ret == 0) { - STACK_UNWIND (frame, - op_ret, - op_errno); - } - return 0; -} - -int32_t -ha_inodelk (call_frame_t *frame, - xlator_t *this, - const char *volume, - loc_t *loc, - int32_t cmd, - struct gf_flock *lock) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_inode (frame, loc->inode); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_inodelk_stub (frame, ha_inodelk, volume, - loc, cmd, lock); - STACK_WIND_COOKIE (frame, - ha_inode_entry_lk_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->inodelk, - volume, - loc, - cmd, - lock); - return 0; -err: - STACK_UNWIND (frame, -1, op_errno); - return 0; -} - -int32_t -ha_entrylk (call_frame_t *frame, - xlator_t *this, - const char *volume, - loc_t *loc, - const char *basename, - entrylk_cmd cmd, - entrylk_type type) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_inode (frame, loc->inode); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_entrylk_stub (frame, ha_entrylk, volume, - loc, basename, cmd, type); - STACK_WIND_COOKIE (frame, - ha_inode_entry_lk_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->entrylk, - volume, loc, basename, cmd, type); - return 0; -err: - STACK_UNWIND (frame, -1, op_errno); - return 0; -} - - int32_t -ha_checksum_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - uint8_t *file_checksum, - uint8_t *dir_checksum) -{ - int ret = -1; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - if (ret == 0) { - STACK_UNWIND (frame, - 
op_ret, - op_errno, - file_checksum, - dir_checksum); - } - return 0; -} - -int32_t -ha_checksum (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flag) -{ - int op_errno = 0; - ha_local_t *local = NULL; - - op_errno = ha_alloc_init_inode (frame, loc->inode); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - local->stub = fop_checksum_stub (frame, ha_checksum, loc, flag); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - STACK_WIND_COOKIE (frame, - ha_checksum_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->checksum, - loc, - flag); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - ha_local_wipe (local); - return 0; -} - -int32_t -ha_common_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries) -{ - int ret = 0; - - ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); - if (ret == 0) - STACK_UNWIND (frame, op_ret, op_errno, entries); - return 0; -} - -int32_t -ha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t off); - -int32_t -ha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t off); -int32_t -ha_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t off, int whichop) -{ - ha_local_t *local = NULL; - int op_errno = 0; - - op_errno = ha_alloc_init_fd (frame, fd); - if (op_errno < 0) { - op_errno = -op_errno; - goto err; - } - local = frame->local; - if (whichop == GF_FOP_READDIR) - local->stub = fop_readdir_stub (frame, ha_readdir, fd, size, - off); - else - local->stub = fop_readdirp_stub (frame, ha_readdirp, fd, size, - off); - if (!local->stub) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - if (whichop == GF_FOP_READDIR) 
- STACK_WIND_COOKIE (frame, ha_common_readdir_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->readdir, - fd, size, off); - else - STACK_WIND_COOKIE (frame, ha_common_readdir_cbk, - (void *)(long)local->active, - HA_ACTIVE_CHILD(this, local), - HA_ACTIVE_CHILD(this, local)->fops->readdirp, - fd, size, off); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, ENOTCONN, NULL); - - ha_local_wipe (local); - return 0; -} - - -int32_t -ha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t off) -{ - ha_do_readdir (frame, this, fd, size, off, GF_FOP_READDIR); - return 0; -} - -int32_t -ha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t off) -{ - ha_do_readdir (frame, this, fd, size, off, GF_FOP_READDIRP); - return 0; -} - -/* Management operations */ - - int32_t -ha_stats_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct xlator_stats *stats) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - call_frame_t *prev_frame = NULL; - xlator_t **children = NULL; - int i = 0; - - local = frame->local; - pvt = this->private; - prev_frame = cookie; - children = pvt->children; - - if (op_ret == -1 && op_errno == ENOTCONN) { - for (i = 0; i < pvt->child_count; i++) { - if (prev_frame->this == children[i]) - break; - } - i++; - for (; i < pvt->child_count; i++) { - if (pvt->state[i]) - break; - } - - if (i == pvt->child_count) { - STACK_UNWIND (frame, -1, ENOTCONN, NULL); - return 0; - } - STACK_WIND (frame, - ha_stats_cbk, - children[i], - children[i]->mops->stats, - local->flags); - return 0; - } - - STACK_UNWIND (frame, - op_ret, - op_errno, - stats); - return 0; -} - -int32_t -ha_stats (call_frame_t *frame, - xlator_t *this, - int32_t flags) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - xlator_t **children = NULL; - int i = 0; - int32_t op_errno = 
EINVAL; - - local = frame->local = GF_CALLOC (1, sizeof (*local), - gf_ha_mt_ha_local_t); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - pvt = this->private; - children = pvt->children; - for (i = 0; i < pvt->child_count; i++) { - if (pvt->state[i]) - break; - } - - if (i == pvt->child_count) { - op_errno = ENOTCONN; - goto err; - } - local->flags = flags; - - STACK_WIND (frame, - ha_stats_cbk, - children[i], - children[i]->mops->stats, - flags); - return 0; - -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, ENOTCONN, NULL); - - ha_local_wipe (local); - return 0; - -} - - -int32_t -ha_getspec_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - char *spec_data) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - call_frame_t *prev_frame = NULL; - xlator_t **children = NULL; - int i = 0; - - local = frame->local; - pvt = this->private; - prev_frame = cookie; - children = pvt->children; - - if (op_ret == -1 && op_errno == ENOTCONN) { - for (i = 0; i < pvt->child_count; i++) { - if (prev_frame->this == children[i]) - break; - } - i++; - for (; i < pvt->child_count; i++) { - if (pvt->state[i]) - break; - } - - if (i == pvt->child_count) { - STACK_UNWIND (frame, -1, ENOTCONN, NULL); - return 0; - } - STACK_WIND (frame, - ha_getspec_cbk, - children[i], - children[i]->mops->getspec, - local->pattern, - local->flags); - return 0; - } - - STACK_UNWIND (frame, - op_ret, - op_errno, - spec_data); - return 0; -} - -int32_t -ha_getspec (call_frame_t *frame, - xlator_t *this, - const char *key, - int32_t flags) -{ - ha_local_t *local = NULL; - ha_private_t *pvt = NULL; - xlator_t **children = NULL; - int i = 0; - int32_t op_errno = EINVAL; - - local = frame->local = GF_CALLOC (1, sizeof (*local), - gf_ha_mt_ha_local_t); - if (!local) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto err; - } - - 
pvt = this->private; - children = pvt->children; - - for (i = 0; i < pvt->child_count; i++) { - if (pvt->state[i]) - break; - } - - if (i == pvt->child_count) { - op_errno = ENOTCONN; - goto err; - } - local->flags = flags; - local->pattern = (char *)key; - - STACK_WIND (frame, - ha_getspec_cbk, - children[i], - children[i]->mops->getspec, - key, flags); - return 0; -err: - local = frame->local; - frame->local = NULL; - - STACK_UNWIND (frame, -1, ENOTCONN, NULL); - - ha_local_wipe (local); - return 0; - -} - -int32_t -ha_closedir (xlator_t *this, - fd_t *fd) -{ - hafd_t *hafdp = NULL; - int op_errno = 0; - uint64_t tmp_hafdp = 0; - - op_errno = fd_ctx_del (fd, this, &tmp_hafdp); - if (op_errno != 0) { - gf_log (this->name, GF_LOG_ERROR, "fd_ctx_del() error"); - return 0; - } - hafdp = (hafd_t *)(long)tmp_hafdp; - - GF_FREE (hafdp->fdstate); - GF_FREE (hafdp->path); - LOCK_DESTROY (&hafdp->lock); - return 0; -} - -int32_t -ha_close (xlator_t *this, - fd_t *fd) -{ - hafd_t *hafdp = NULL; - int op_errno = 0; - uint64_t tmp_hafdp = 0; - - op_errno = fd_ctx_del (fd, this, &tmp_hafdp); - if (op_errno != 0) { - gf_log (this->name, GF_LOG_ERROR, "fd_ctx_del() error"); - return 0; - } - hafdp = (hafd_t *)(long)tmp_hafdp; - - GF_FREE (hafdp->fdstate); - GF_FREE (hafdp->path); - LOCK_DESTROY (&hafdp->lock); - return 0; -} - -/* notify */ -int32_t -notify (xlator_t *this, - int32_t event, - void *data, - ...) 
-{ - ha_private_t *pvt = NULL; - int32_t i = 0, upcnt = 0; - - pvt = this->private; - if (pvt == NULL) { - gf_log (this->name, GF_LOG_DEBUG, "got notify before init()"); - return 0; - } - - switch (event) - { - case GF_EVENT_CHILD_DOWN: - { - for (i = 0; i < pvt->child_count; i++) { - if (data == pvt->children[i]) - break; - } - gf_log (this->name, GF_LOG_DEBUG, "GF_EVENT_CHILD_DOWN from %s", pvt->children[i]->name); - pvt->state[i] = 0; - for (i = 0; i < pvt->child_count; i++) { - if (pvt->state[i]) - break; - } - if (i == pvt->child_count) { - default_notify (this, event, data); - } - } - break; - case GF_EVENT_CHILD_UP: - { - for (i = 0; i < pvt->child_count; i++) { - if (data == pvt->children[i]) - break; - } - - gf_log (this->name, GF_LOG_DEBUG, "GF_EVENT_CHILD_UP from %s", pvt->children[i]->name); - - pvt->state[i] = 1; - - for (i = 0; i < pvt->child_count; i++) { - if (pvt->state[i]) - upcnt++; - } - - if (upcnt == 1) { - default_notify (this, event, data); - } - } - break; - - default: - { - default_notify (this, event, data); - } - } - - return 0; -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_ha_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; -} - -int -init (xlator_t *this) -{ - ha_private_t *pvt = NULL; - xlator_list_t *trav = NULL; - int count = 0, ret = 0; - - - if (!this->children) { - gf_log (this->name,GF_LOG_ERROR, - "FATAL: ha should have one or more child defined"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. 
check volfile "); - } - - trav = this->children; - pvt = GF_CALLOC (1, sizeof (ha_private_t), gf_ha_mt_ha_private_t); - - ret = dict_get_int32 (this->options, "preferred-subvolume", - &pvt->pref_subvol); - if (ret < 0) { - pvt->pref_subvol = -1; - } - - trav = this->children; - while (trav) { - count++; - trav = trav->next; - } - - pvt->child_count = count; - pvt->children = GF_CALLOC (count, sizeof (xlator_t*), - gf_ha_mt_xlator_t); - - trav = this->children; - count = 0; - while (trav) { - pvt->children[count] = trav->xlator; - count++; - trav = trav->next; - } - - pvt->state = GF_CALLOC (1, count, gf_ha_mt_char); - this->private = pvt; - return 0; -} - -void -fini (xlator_t *this) -{ - ha_private_t *priv = NULL; - priv = this->private; - GF_FREE (priv); - return; -} - - -struct xlator_fops fops = { - .lookup = ha_lookup, - .stat = ha_stat, - .readlink = ha_readlink, - .mknod = ha_mknod, - .mkdir = ha_mkdir, - .unlink = ha_unlink, - .rmdir = ha_rmdir, - .symlink = ha_symlink, - .rename = ha_rename, - .link = ha_link, - .truncate = ha_truncate, - .create = ha_create, - .open = ha_open, - .readv = ha_readv, - .writev = ha_writev, - .statfs = ha_statfs, - .flush = ha_flush, - .fsync = ha_fsync, - .setxattr = ha_setxattr, - .getxattr = ha_getxattr, - .removexattr = ha_removexattr, - .opendir = ha_opendir, - .readdir = ha_readdir, - .readdirp = ha_readdirp, - .getdents = ha_getdents, - .fsyncdir = ha_fsyncdir, - .access = ha_access, - .ftruncate = ha_ftruncate, - .fstat = ha_fstat, - .lk = ha_lk, - .setdents = ha_setdents, - .lookup_cbk = ha_lookup_cbk, - .checksum = ha_checksum, - .xattrop = ha_xattrop, - .fxattrop = ha_fxattrop, - .setattr = ha_setattr, - .fsetattr = ha_fsetattr, -}; - -struct xlator_cbks cbks = { - .release = ha_close, - .releasedir = ha_closedir, - .forget = ha_forget, -}; diff --git a/xlators/cluster/ha/src/ha.h b/xlators/cluster/ha/src/ha.h deleted file mode 100644 index d6a519fa8b6..00000000000 --- a/xlators/cluster/ha/src/ha.h +++ /dev/null @@ 
-1,63 +0,0 @@ -/* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#ifndef __HA_H_ -#define __HA_H_ - -#include "ha-mem-types.h" - -typedef struct { - call_stub_t *stub; - int32_t op_ret, op_errno; - int32_t active, tries, revalidate, revalidate_error; - int32_t call_count; - char *state, *pattern; - dict_t *dict; - loc_t loc; - struct iatt buf; - struct iatt postparent; - struct iatt preparent; - fd_t *fd; - inode_t *inode; - int32_t flags; - int32_t first_success; -} ha_local_t; - -typedef struct { - char *state; - xlator_t **children; - int child_count, pref_subvol; -} ha_private_t; - -typedef struct { - char *fdstate; - char *path; - gf_lock_t lock; - int active; -} hafd_t; - -#define HA_ACTIVE_CHILD(this, local) (((ha_private_t *)this->private)->children[local->active]) - -extern int ha_alloc_init_fd (call_frame_t *frame, fd_t *fd); - -extern int ha_handle_cbk (call_frame_t *frame, void *cookie, int op_ret, int op_errno) ; - -extern int ha_alloc_init_inode (call_frame_t *frame, inode_t *inode); - -#endif diff --git a/xlators/cluster/map/Makefile.am b/xlators/cluster/map/Makefile.am deleted file mode 100644 index d471a3f9243..00000000000 --- a/xlators/cluster/map/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -SUBDIRS = src - -CLEANFILES = diff --git 
a/xlators/cluster/map/src/Makefile.am b/xlators/cluster/map/src/Makefile.am deleted file mode 100644 index 26e19137a8b..00000000000 --- a/xlators/cluster/map/src/Makefile.am +++ /dev/null @@ -1,15 +0,0 @@ -xlator_LTLIBRARIES = map.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/cluster - -map_la_LDFLAGS = -module -avoidversion - -map_la_SOURCES = map.c map-helper.c -map_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -noinst_HEADERS = map.h - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) - -CLEANFILES = - diff --git a/xlators/cluster/map/src/map-helper.c b/xlators/cluster/map/src/map-helper.c deleted file mode 100644 index 24ab6eb5060..00000000000 --- a/xlators/cluster/map/src/map-helper.c +++ /dev/null @@ -1,358 +0,0 @@ -/* - Copyright (c) 2009-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "map.h" - - -xlator_t * -map_subvol_next (xlator_t *this, xlator_t *prev) -{ - map_private_t *priv = NULL; - xlator_t *next = NULL; - int i = 0; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (priv->xlarray[i].xl == prev) { - if ((i + 1) < priv->child_count) - next = priv->xlarray[i + 1].xl; - break; - } - } - - return next; -} - -int -map_subvol_cnt (xlator_t *this, xlator_t *subvol) -{ - int i = 0; - int ret = -1; - map_private_t *priv = NULL; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (subvol == priv->xlarray[i].xl) { - ret = i; - break; - } - } - - return ret; -} - -int -map_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) -{ - map_private_t *priv = NULL; - int cnt = 0; - int max = 0; - uint64_t y = 0; - - if (x == ((uint64_t) -1)) { - y = (uint64_t) -1; - goto out; - } - - priv = this->private; - - max = priv->child_count; - cnt = map_subvol_cnt (this, subvol); - - y = ((x * max) + cnt); - -out: - if (y_p) - *y_p = y; - - return 0; -} - - -int -map_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, - uint64_t *x_p) -{ - int cnt = 0; - int max = 0; - uint64_t x = 0; - xlator_t *subvol = 0; - map_private_t *priv = NULL; - - priv = this->private; - max = priv->child_count; - - cnt = y % max; - x = y / max; - - subvol = priv->xlarray[cnt].xl; - - if (subvol_p) - *subvol_p = subvol; - - if (x_p) - *x_p = x; - - return 0; -} - - -xlator_t * -get_mapping_subvol_from_path (xlator_t *this, const char *path) -{ - map_private_t *priv = NULL; - struct map_pattern *map = NULL; - - /* To make sure we handle '/' properly */ - if (!strcmp (path, "/")) - return NULL; - - priv = this->private; - - map = priv->map; - while (map) { - if (!strncmp (map->directory, path, map->dir_len)) { - if ((path[map->dir_len] == '/') || - (path[map->dir_len] == '\0')) { - return map->xl; - } 
- } - - map = map->next; - } - - return priv->default_xl; -} - -xlator_t * -get_mapping_subvol_from_ctx (xlator_t *this, inode_t *inode) -{ - uint64_t subvol = 0; - int ret = -1; - - ret = inode_ctx_get (inode, this, &subvol); - if (ret != 0) - return NULL; - - return (xlator_t *)(long)subvol; -} - -int -check_multiple_volume_entry (xlator_t *this, - xlator_t *subvol) -{ - int ret = -1; - int idx = 0; - map_private_t *priv = NULL; - - priv = this->private; - - for (idx = 0; idx < priv->child_count; idx++) { - if (priv->xlarray[idx].xl == subvol) { - if (priv->xlarray[idx].mapped) { - gf_log (this->name, GF_LOG_ERROR, - "subvolume '%s' is already mapped", - subvol->name); - goto out; - } - priv->xlarray[idx].mapped = 1; - ret = 0; - goto out; - } - } - - gf_log (this->name, GF_LOG_ERROR, - "subvolume '%s' is not found", - subvol->name); - - out: - return ret; -} - -int -verify_dir_and_assign_subvol (xlator_t *this, - const char *directory, - const char *subvol) -{ - int default_flag = 0; - int ret = -1; - int idx = 0; - map_private_t *priv = NULL; - xlator_list_t *trav = NULL; - struct map_pattern *tmp_map = NULL; - - priv = this->private; - - /* check if directory is valid, ie, its a top level dir, and - * not includes a '*' in it. - */ - if (!strcmp ("*", directory)) { - default_flag = 1; - } else { - if (directory[0] != '/') { - gf_log (this->name, GF_LOG_ERROR, - "map takes absolute path, starting with '/'. " - "not '%s'", directory); - goto out; - } - for (idx = 1; idx < (strlen (directory) - 1); idx++) { - if (directory[idx] == '/') { - gf_log (this->name, GF_LOG_ERROR, - "map takes only top level directory, " - "not '%s'", directory); - goto out; - } - } - } - - /* Assign proper subvolume */ - trav = this->children; - while (trav) { - if (!strcmp (trav->xlator->name, subvol)) { - - /* Check if there is another directory for - * same volume, if yes, return error. 
- */ - ret = check_multiple_volume_entry (this, - trav->xlator); - if (ret != 0) { - goto out; - } - - ret = 0; - if (default_flag) { - if (priv->default_xl) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "'*' specified more than " - "once. don't confuse me!!!"); - } - - priv->default_xl = trav->xlator; - goto out; - } - - tmp_map = GF_CALLOC (1, sizeof (struct map_pattern), - gf_map_mt_map_pattern); - tmp_map->xl = trav->xlator; - tmp_map->dir_len = strlen (directory); - - /* make sure that the top level directory starts - * with '/' and ends without '/' - */ - tmp_map->directory = gf_strdup (directory); - if (directory[tmp_map->dir_len - 1] == '/') { - tmp_map->dir_len--; - } - - if (!priv->map) - priv->map = tmp_map; - else { - struct map_pattern *trav_map = NULL; - trav_map = priv->map; - while (trav_map->next) - trav_map = trav_map->next; - trav_map->next = tmp_map; - } - - goto out; - } - - trav = trav->next; - } - - gf_log (this->name, GF_LOG_ERROR, - "map volume '%s' is not proper subvolume", subvol); - - out: - return ret; -} - -int -assign_default_subvol (xlator_t *this, const char *default_xl) -{ - int ret = -1; - map_private_t *priv = NULL; - xlator_list_t *trav = NULL; - - priv = this->private; - trav = this->children; - - while (trav) { - if (!strcmp (trav->xlator->name, default_xl)) { - ret = check_multiple_volume_entry (this, - trav->xlator); - if (ret != 0) { - goto out; - } - if (priv->default_xl) - gf_log (this->name, GF_LOG_WARNING, - "default-volume option provided, " - "overriding earlier '*' option"); - priv->default_xl = trav->xlator; - return 0; - } - trav = trav->next; - } - - gf_log (this->name, GF_LOG_ERROR, - "default-volume value is not an valid subvolume. 
check again"); - out: - return -1; -} - -void -verify_if_all_subvolumes_got_used (xlator_t *this) -{ - int idx = 0; - map_private_t *priv = NULL; - - priv = this->private; - - for (idx = 0; idx < priv->child_count; idx++) { - if (!priv->xlarray[idx].mapped) { - if (!priv->default_xl) { - priv->default_xl = priv->xlarray[idx].xl; - priv->xlarray[idx].mapped = 1; - } else { - gf_log (this->name, GF_LOG_WARNING, - "subvolume '%s' is not mapped to " - "any directory", - priv->xlarray[idx].xl->name); - } - } - } - - if (!priv->default_xl) { - gf_log (this->name, GF_LOG_WARNING, - "default subvolume not specified, filesystem " - "may not work properly. Check 'map' translator " - "documentation for more info"); - } - - return ; -} diff --git a/xlators/cluster/map/src/map-mem-types.h b/xlators/cluster/map/src/map-mem-types.h deleted file mode 100644 index 23c798deaf3..00000000000 --- a/xlators/cluster/map/src/map-mem-types.h +++ /dev/null @@ -1,35 +0,0 @@ - -/* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - - -#ifndef __MAP_MEM_TYPES_H__ -#define __MAP_MEM_TYPES_H__ - -#include "mem-types.h" - -enum gf_map_mem_types_ { - gf_map_mt_map_private_t = gf_common_mt_end + 1, - gf_map_mt_map_local_t, - gf_map_mt_map_xlator_array, - gf_map_mt_map_pattern, - gf_map_mt_end -}; -#endif - diff --git a/xlators/cluster/map/src/map.c b/xlators/cluster/map/src/map.c deleted file mode 100644 index 73186e8cb3a..00000000000 --- a/xlators/cluster/map/src/map.c +++ /dev/null @@ -1,2577 +0,0 @@ -/* - Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "map.h" - -/* TODO : - * -> support for 'get' 'put' API in through xattrs. 
- * -> define the behavior of notify() - */ - -static int32_t -map_stat_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf) - -{ - call_frame_t *prev = NULL; - prev = cookie; - - map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino); - - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - -static int32_t -map_setattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *statpre, - struct iatt *statpost) -{ - call_frame_t *prev = NULL; - prev = cookie; - - map_itransform (this, prev->this, statpre->ia_ino, &statpre->ia_ino); - map_itransform (this, prev->this, statpost->ia_ino, &statpost->ia_ino); - - STACK_UNWIND (frame, op_ret, op_errno, statpre, statpost); - return 0; -} - -static int32_t -map_fsetattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *statpre, - struct iatt *statpost) -{ - call_frame_t *prev = NULL; - prev = cookie; - - map_itransform (this, prev->this, statpre->ia_ino, &statpre->ia_ino); - map_itransform (this, prev->this, statpost->ia_ino, &statpost->ia_ino); - - STACK_UNWIND (frame, op_ret, op_errno, statpre, statpost); - return 0; -} - -static int32_t -map_truncate_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - call_frame_t *prev = NULL; - prev = cookie; - - map_itransform (this, prev->this, postbuf->ia_ino, &postbuf->ia_ino); - - STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf); - return 0; -} - -static int32_t -map_ftruncate_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - call_frame_t *prev = NULL; - prev = cookie; - - map_itransform (this, prev->this, postbuf->ia_ino, &postbuf->ia_ino); - - STACK_UNWIND (frame, op_ret, op_errno, 
prebuf, postbuf); - return 0; -} - - -static int32_t -map_access_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -static int32_t -map_readlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - const char *path, - struct iatt *sbuf) -{ - STACK_UNWIND (frame, op_ret, op_errno, path, sbuf); - return 0; -} - -static int32_t -map_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - STACK_UNWIND (frame, op_ret, op_errno, preparent, postparent); - return 0; -} - -static int32_t -map_rmdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - STACK_UNWIND (frame, op_ret, op_errno, preparent, postparent); - return 0; -} - - -static int32_t -map_rename_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf, - struct iatt *preoldparent, - struct iatt *postoldparent, - struct iatt *prenewparent, - struct iatt *postnewparent) -{ - call_frame_t *prev = NULL; - prev = cookie; - - map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino); - - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - -static int32_t -map_link_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - call_frame_t *prev = NULL; - prev = cookie; - - map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino); - - STACK_UNWIND (frame, op_ret, op_errno, inode, buf); - return 0; -} - -static int32_t -map_open_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - STACK_UNWIND 
(frame, op_ret, op_errno, fd); - return 0; -} - -static int32_t -map_readv_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iovec *vector, - int32_t count, - struct iatt *stbuf, - struct iobref *iobref) -{ - call_frame_t *prev = NULL; - prev = cookie; - - map_itransform (this, prev->this, stbuf->ia_ino, &stbuf->ia_ino); - - STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf, iobref); - return 0; -} - -static int32_t -map_writev_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - call_frame_t *prev = NULL; - prev = cookie; - - map_itransform (this, prev->this, postbuf->ia_ino, &postbuf->ia_ino); - - STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf); - return 0; -} - -static int32_t -map_flush_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - -static int32_t -map_fsync_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf); - return 0; -} - - -static int32_t -map_fstat_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf) -{ - call_frame_t *prev = NULL; - prev = cookie; - - map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino); - - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - - -static int32_t -map_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entries, - int32_t count) -{ - STACK_UNWIND (frame, op_ret, op_errno, entries, count); - return 0; -} - - -static int32_t -map_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - 
STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -static int32_t -map_fsyncdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - -static int32_t -map_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - -static int32_t -map_fsetxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - -static int32_t -map_fgetxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *dict) -{ - STACK_UNWIND (frame, op_ret, op_errno, dict); - return 0; -} - - - -static int32_t -map_getxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *dict) -{ - STACK_UNWIND (frame, op_ret, op_errno, dict); - return 0; -} - -int32_t -map_xattrop_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *dict) -{ - STACK_UNWIND (frame, op_ret, op_errno, dict); - return 0; -} - -int32_t -map_fxattrop_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *dict) -{ - STACK_UNWIND (frame, op_ret, op_errno, dict); - return 0; -} - -static int32_t -map_removexattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - -static int32_t -map_lk_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct gf_flock *lock) -{ - STACK_UNWIND (frame, op_ret, op_errno, lock); - return 0; -} - - -static int32_t -map_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t 
op_errno) - -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - - -static int32_t -map_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - -static int32_t -map_entrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - -static int32_t -map_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -static int32_t -map_newentry_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - call_frame_t *prev = NULL; - prev = cookie; - - map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino); - - STACK_UNWIND (frame, op_ret, op_errno, inode, buf); - return 0; - -} - - -static int32_t -map_create_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - call_frame_t *prev = NULL; - prev = cookie; - - map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino); - - STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); - return 0; -} - - -/* - * map_normalize_stats - - */ -void -map_normalize_stats (struct statvfs *buf, - unsigned long bsize, - unsigned long frsize) -{ - double factor; - - if (buf->f_bsize != bsize) { - factor = ((double) buf->f_bsize) / bsize; - buf->f_bsize = bsize; - buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree); - buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail); - } - - if (buf->f_frsize != frsize) { - factor = ((double) buf->f_frsize) / frsize; - buf->f_frsize = frsize; - buf->f_blocks = (fsblkcnt_t) (factor * 
buf->f_blocks); - } -} - - -int32_t -map_statfs_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct statvfs *stbuf) -{ - struct statvfs *dict_buf = NULL; - map_local_t *local = NULL; - int this_call_cnt = 0; - unsigned long bsize; - unsigned long frsize; - - local = frame->local; - - LOCK (&frame->lock); - { - this_call_cnt = --local->call_count; - - if (op_ret == -1) { - local->op_errno = op_errno; - goto unlock; - } - local->op_ret = 0; - - /* when a call is successfull, add it to local->dict */ - dict_buf = &local->statvfs; - - if (dict_buf->f_bsize != 0) { - bsize = max (dict_buf->f_bsize, - stbuf->f_bsize); - - frsize = max (dict_buf->f_frsize, - stbuf->f_frsize); - map_normalize_stats(dict_buf, bsize, frsize); - map_normalize_stats(stbuf, bsize, frsize); - } else { - dict_buf->f_bsize = stbuf->f_bsize; - dict_buf->f_frsize = stbuf->f_frsize; - } - - dict_buf->f_blocks += stbuf->f_blocks; - dict_buf->f_bfree += stbuf->f_bfree; - dict_buf->f_bavail += stbuf->f_bavail; - dict_buf->f_files += stbuf->f_files; - dict_buf->f_ffree += stbuf->f_ffree; - dict_buf->f_favail += stbuf->f_favail; - dict_buf->f_fsid = stbuf->f_fsid; - dict_buf->f_flag = stbuf->f_flag; - dict_buf->f_namemax = stbuf->f_namemax; - } -unlock: - UNLOCK (&frame->lock); - - if (!this_call_cnt) { - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->statvfs); - } - - return 0; -} - -int32_t -map_single_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) -{ - call_frame_t *prev = NULL; - prev = cookie; - - map_itransform (this, prev->this, buf->ia_ino, &buf->ia_ino); - - STACK_UNWIND (frame, op_ret, op_errno, inode, buf, dict); - - return 0; -} - -int32_t -map_root_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt 
*buf, - dict_t *dict, - struct iatt *postparent) -{ - int callcnt = 0; - map_local_t *local = NULL; - inode_t *tmp_inode = NULL; - dict_t *tmp_dict = NULL; - - local = frame->local; - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if ((op_ret == 0) && (local->op_ret == -1)) { - local->op_ret = 0; - local->stbuf = *buf; - if (dict) - local->dict = dict_ref (dict); - local->inode = inode_ref (inode); - } - if (op_ret == -1) - local->op_errno = op_errno; - - } - UNLOCK (&frame->lock); - - if (!callcnt) { - tmp_dict = local->dict; - tmp_inode = local->inode; - - STACK_UNWIND (frame, local->op_ret, - local->op_errno, local->inode, - &local->stbuf, local->dict); - - inode_unref (local->inode); - if (tmp_dict) - dict_unref (tmp_dict); - } - - return 0; -} - - -int32_t -map_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - int callcnt = 0; - map_local_t *local = NULL; - fd_t *local_fd = NULL; - - local = frame->local; - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - local->op_errno = op_errno; - goto unlock; - } - - local->op_ret = 0; - } - unlock: - UNLOCK (&frame->lock); - - if (!callcnt) { - local_fd = local->fd; - local->fd = NULL; - - STACK_UNWIND (frame, local->op_ret, - local->op_errno, local_fd); - - fd_unref (local_fd); - } - return 0; -} - -int32_t -map_single_readdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - gf_dirent_t *entries) -{ - call_frame_t *prev = NULL; - gf_dirent_t *orig_entry = NULL; - - prev = cookie; - - list_for_each_entry (orig_entry, &entries->list, list) { - map_itransform (this, prev->this, orig_entry->d_ino, - &orig_entry->d_ino); - } - STACK_UNWIND (frame, op_ret, op_errno, entries); - - return 0; -} - - -int32_t -map_single_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries) -{ - call_frame_t 
*prev = NULL; - gf_dirent_t *orig_entry = NULL; - - prev = cookie; - - list_for_each_entry (orig_entry, &entries->list, list) { - map_itransform (this, prev->this, orig_entry->d_ino, - &orig_entry->d_ino); - orig_entry->d_stat.ia_ino = orig_entry->d_ino; - } - STACK_UNWIND (frame, op_ret, op_errno, entries); - return 0; -} - -int -map_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *orig_entries); - -int -map_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *orig_entries); - -int -map_generic_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *orig_entries, - int whichop) -{ - map_local_t *local = NULL; - gf_dirent_t entries; - gf_dirent_t *orig_entry = NULL; - gf_dirent_t *entry = NULL; - call_frame_t *prev = NULL; - xlator_t *subvol = NULL; - xlator_t *next_subvol = NULL; - off_t next_offset = 0; - int count = 0; - fd_t *local_fd = NULL; - - INIT_LIST_HEAD (&entries.list); - prev = cookie; - local = frame->local; - - if (op_ret < 0) - goto done; - - list_for_each_entry (orig_entry, &orig_entries->list, list) { - subvol = prev->this; - - entry = gf_dirent_for_name (orig_entry->d_name); - if (!entry) { - gf_log (this->name, GF_LOG_ERROR, - "memory allocation failed :("); - goto unwind; - } - - map_itransform (this, subvol, orig_entry->d_ino, - &entry->d_ino); - map_itransform (this, subvol, orig_entry->d_off, - &entry->d_off); - - if (whichop == GF_FOP_READDIRP) - entry->d_stat.ia_ino = entry->d_ino; - entry->d_type = orig_entry->d_type; - entry->d_len = orig_entry->d_len; - - list_add_tail (&entry->list, &entries.list); - count++; - next_offset = orig_entry->d_off; - } - - op_ret = count; - -done: - if (count == 0) { - /* non-zero next_offset means that - EOF is not yet hit on the current subvol - */ - if (next_offset == 0) { - next_subvol = map_subvol_next (this, prev->this); - } else { - next_subvol 
= prev->this; - } - - if (!next_subvol) { - goto unwind; - } - - if (whichop == GF_FOP_READDIR) - STACK_WIND (frame, map_readdir_cbk, next_subvol, - next_subvol->fops->readdir, local->fd, - local->size, 0); - else - STACK_WIND (frame, map_readdirp_cbk, next_subvol, - next_subvol->fops->readdirp, local->fd, - local->size, 0); - return 0; - } - -unwind: - if (op_ret < 0) - op_ret = 0; - - local_fd = local->fd; - local->fd = NULL; - - STACK_UNWIND (frame, op_ret, op_errno, &entries); - - fd_unref (local_fd); - - gf_dirent_free (&entries); - - return 0; -} - - -int -map_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *orig_entries) -{ - map_generic_readdir_cbk (frame, cookie, this, op_ret, op_errno, - orig_entries, GF_FOP_READDIR); - return 0; -} - - -int -map_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *orig_entries) -{ - map_generic_readdir_cbk (frame, cookie, this, op_ret, op_errno, - orig_entries, GF_FOP_READDIRP); - return 0; -} - - -/* Management operations */ - -static int32_t -map_checksum_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - uint8_t *file_checksum, - uint8_t *dir_checksum) -{ - STACK_UNWIND (frame, op_ret, op_errno, file_checksum, dir_checksum); - return 0; -} - - -/* Fops starts here */ - -int32_t -map_stat (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_stat_cbk, subvol, subvol->fops->stat, loc); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_setattr 
(call_frame_t *frame, - xlator_t *this, - loc_t *loc, - struct iatt *stbuf, - int32_t valid) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - GF_VALIDATE_OR_GOTO ("map", this, err); - GF_VALIDATE_OR_GOTO (this->name, frame, err); - GF_VALIDATE_OR_GOTO (this->name, loc, err); - GF_VALIDATE_OR_GOTO (this->name, loc->inode, err); - GF_VALIDATE_OR_GOTO (this->name, loc->path, err); - GF_VALIDATE_OR_GOTO (this->name, stbuf, err); - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_setattr_cbk, subvol, - subvol->fops->setattr, loc, stbuf, valid); - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_fsetattr (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iatt *stbuf, - int32_t valid) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - GF_VALIDATE_OR_GOTO ("map", this, err); - GF_VALIDATE_OR_GOTO (this->name, frame, err); - GF_VALIDATE_OR_GOTO (this->name, fd, err); - GF_VALIDATE_OR_GOTO (this->name, stbuf, err); - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_fsetattr_cbk, subvol, - subvol->fops->fsetattr, fd, stbuf, valid); - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_truncate (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - off_t offset) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_truncate_cbk, subvol, - subvol->fops->truncate, loc, offset); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - 
-int32_t -map_ftruncate (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - off_t offset) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_ftruncate_cbk, subvol, - subvol->fops->ftruncate, fd, offset); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_access (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t mask) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_access_cbk, subvol, - subvol->fops->access, loc, mask); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int32_t -map_readlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - size_t size) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_readlink_cbk, subvol, - subvol->fops->readlink, loc, size); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_unlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - 
VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_unlink_cbk, subvol, subvol->fops->unlink, loc); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_rmdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_rmdir_cbk, subvol, subvol->fops->rmdir, loc); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int32_t -map_rename (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - int32_t op_errno = 1; - xlator_t *old_subvol = NULL; - xlator_t *new_subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (oldloc, err); - VALIDATE_OR_GOTO (oldloc->inode, err); - VALIDATE_OR_GOTO (oldloc->path, err); - VALIDATE_OR_GOTO (newloc, err); - - old_subvol = get_mapping_subvol_from_ctx (this, oldloc->inode); - if (!old_subvol) { - op_errno = EINVAL; - goto err; - } - - if (newloc->path) { - new_subvol = get_mapping_subvol_from_path (this, newloc->path); - if (new_subvol && (new_subvol != old_subvol)) { - op_errno = EXDEV; - goto err; - } - } - - STACK_WIND (frame, map_rename_cbk, old_subvol, - old_subvol->fops->rename, oldloc, newloc); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int32_t -map_link (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - int32_t op_errno = 1; - xlator_t 
*old_subvol = NULL; - xlator_t *new_subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (oldloc, err); - VALIDATE_OR_GOTO (oldloc->inode, err); - VALIDATE_OR_GOTO (oldloc->path, err); - VALIDATE_OR_GOTO (newloc, err); - - old_subvol = get_mapping_subvol_from_ctx (this, oldloc->inode); - if (!old_subvol) { - op_errno = EINVAL; - goto err; - } - - if (newloc->path) { - new_subvol = get_mapping_subvol_from_path (this, newloc->path); - if (new_subvol && (new_subvol != old_subvol)) { - op_errno = EXDEV; - goto err; - } - } - - STACK_WIND (frame, map_link_cbk, old_subvol, - old_subvol->fops->link, oldloc, newloc); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int32_t -map_open (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, fd_t *fd, int wbflags) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_open_cbk, subvol, - subvol->fops->open, loc, flags, fd, wbflags); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_readv (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_readv_cbk, subvol, - subvol->fops->readv, fd, size, offset); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL, NULL); - - return 0; -} - -int32_t 
-map_writev (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iovec *vector, - int32_t count, - off_t off, - struct iobref *iobref) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_writev_cbk, subvol, - subvol->fops->writev, fd, vector, count, off, iobref); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_flush (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_flush_cbk, subvol, subvol->fops->flush, fd); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int32_t -map_fsync (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_fsync_cbk, subvol, - subvol->fops->fsync, fd, flags); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_fstat (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); 
- - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_fstat_cbk, subvol, subvol->fops->fstat, fd); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_getdents (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset, - int32_t flag) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_getdents_cbk, subvol, - subvol->fops->getdents, fd, size, offset, flag); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_setdents (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags, - dir_entry_t *entries, - int32_t count) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_setdents_cbk, subvol, - subvol->fops->setdents, fd, flags, entries, count); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int32_t -map_fsyncdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_fsyncdir_cbk, subvol, - subvol->fops->fsyncdir, fd, 
flags); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - - - - -int32_t -map_setxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *dict, - int32_t flags) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_setxattr_cbk, subvol, - subvol->fops->setxattr, loc, dict, flags); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_getxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_getxattr_cbk, subvol, - subvol->fops->getxattr, loc, name); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int32_t -map_fsetxattr (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - dict_t *dict, - int32_t flags) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_fsetxattr_cbk, subvol, - subvol->fops->fsetxattr, fd, dict, flags); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_fgetxattr (call_frame_t *frame, 
- xlator_t *this, - fd_t *fd, - const char *name) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_fgetxattr_cbk, subvol, - subvol->fops->fgetxattr, fd, name); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int32_t -map_xattrop (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - gf_xattrop_flags_t flags, - dict_t *dict) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_xattrop_cbk, subvol, - subvol->fops->xattrop, loc, flags, dict); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_fxattrop (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - gf_xattrop_flags_t flags, - dict_t *dict) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_fxattrop_cbk, subvol, - subvol->fops->fxattrop, fd, flags, dict); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_removexattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, 
err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_removexattr_cbk, subvol, - subvol->fops->removexattr, loc, name); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_lk (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t cmd, - struct gf_flock *lock) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_lk_cbk, subvol, - subvol->fops->lk, fd, cmd, lock); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int32_t -map_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *lock) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_inodelk_cbk, subvol, - subvol->fops->inodelk, volume, loc, cmd, lock); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int32_t -map_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *lock) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - subvol = 
get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_finodelk_cbk, subvol, - subvol->fops->finodelk, volume, fd, cmd, lock); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_entrylk_cbk, subvol, - subvol->fops->entrylk, volume, loc, basename, cmd, type); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_fentrylk_cbk, subvol, - subvol->fops->fentrylk, volume, fd, basename, cmd, type); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_checksum (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flag) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - subvol = get_mapping_subvol_from_ctx 
(this, loc->inode); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, map_checksum_cbk, subvol, - subvol->fops->checksum, loc, flag); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int32_t -map_mknod (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode, - dev_t rdev) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - subvol = get_mapping_subvol_from_path (this, loc->path); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol); - if (op_errno != 0) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set subvolume ptr in inode ctx", - loc->path); - } - - STACK_WIND (frame, map_newentry_cbk, subvol, - subvol->fops->mknod, loc, mode, rdev); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_mkdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - subvol = get_mapping_subvol_from_path (this, loc->path); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol); - if (op_errno != 0) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set subvolume ptr in inode ctx", - loc->path); - } - - STACK_WIND (frame, map_newentry_cbk, subvol, - subvol->fops->mkdir, loc, mode); - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int32_t -map_symlink (call_frame_t *frame, - xlator_t *this, - const char *linkpath, - loc_t *loc) -{ - int32_t 
op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - subvol = get_mapping_subvol_from_path (this, loc->path); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol); - if (op_errno != 0) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set subvolume ptr in inode ctx", - loc->path); - } - - STACK_WIND (frame, map_newentry_cbk, subvol, - subvol->fops->symlink, linkpath, loc); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - -int32_t -map_create (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - mode_t mode, fd_t *fd) -{ - int32_t op_errno = 1; - xlator_t *subvol = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - subvol = get_mapping_subvol_from_path (this, loc->path); - if (!subvol) { - op_errno = EINVAL; - goto err; - } - - op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol); - if (op_errno != 0) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set subvolume ptr in inode ctx", - loc->path); - } - - STACK_WIND (frame, map_create_cbk, subvol, - subvol->fops->create, loc, flags, mode, fd); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int32_t -map_lookup (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *xattr_req) -{ - int32_t op_errno = EINVAL; - xlator_t *subvol = NULL; - map_local_t *local = NULL; - map_private_t *priv = NULL; - xlator_list_t *trav = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - priv = this->private; - 
- if (loc->inode->ino == 1) - goto root_inode; - - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - subvol = get_mapping_subvol_from_path (this, loc->path); - if (!subvol) { - goto err; - } - - op_errno = inode_ctx_put (loc->inode, this, - (uint64_t)(long)subvol); - if (op_errno != 0) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set subvolume in inode ctx", - loc->path); - } - } - - /* Just one callback */ - STACK_WIND (frame, map_single_lookup_cbk, subvol, - subvol->fops->lookup, loc, xattr_req); - - return 0; - - root_inode: - local = GF_CALLOC (1, sizeof (map_local_t), - gf_map_mt_map_local_t); - - frame->local = local; - local->call_count = priv->child_count; - local->op_ret = -1; - - trav = this->children; - while (trav) { - STACK_WIND (frame, map_root_lookup_cbk, trav->xlator, - trav->xlator->fops->lookup, loc, xattr_req); - trav = trav->next; - } - - return 0; - - err: - STACK_UNWIND (frame, -1, op_errno, NULL, NULL); - - return 0; -} - - -int32_t -map_statfs (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - int32_t op_errno = EINVAL; - xlator_t *subvol = NULL; - map_local_t *local = NULL; - map_private_t *priv = NULL; - xlator_list_t *trav = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - if (loc->inode->ino == 1) - goto root_inode; - subvol = get_mapping_subvol_from_ctx (this, loc->inode); - if (!subvol) { - goto err; - } - - /* Just one callback */ - STACK_WIND (frame, map_statfs_cbk, subvol, subvol->fops->statfs, loc); - - return 0; - - root_inode: - local = GF_CALLOC (1, sizeof (map_local_t), - gf_map_mt_map_local_t); - - priv = this->private; - frame->local = local; - local->call_count = priv->child_count; - local->op_ret = -1; - - trav = this->children; - while (trav) { - STACK_WIND (frame, map_statfs_cbk, trav->xlator, - trav->xlator->fops->statfs, loc); - trav = 
trav->next; - } - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); - - return 0; -} - -int32_t -map_opendir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, fd_t *fd) -{ - int32_t op_errno = EINVAL; - xlator_t *subvol = NULL; - map_local_t *local = NULL; - map_private_t *priv = NULL; - xlator_list_t *trav = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - if (loc->inode->ino == 1) - goto root_inode; - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - goto err; - } - - /* Just one callback */ - STACK_WIND (frame, map_opendir_cbk, subvol, - subvol->fops->opendir, loc, fd); - - return 0; - - root_inode: - local = GF_CALLOC (1, sizeof (map_local_t), - gf_map_mt_map_local_t); - - priv = this->private; - frame->local = local; - local->call_count = priv->child_count; - local->op_ret = -1; - local->fd = fd_ref (fd); - - trav = this->children; - while (trav) { - STACK_WIND (frame, map_opendir_cbk, trav->xlator, - trav->xlator->fops->opendir, loc, fd); - trav = trav->next; - } - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); - - return 0; -} - - -int32_t -map_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff, int whichop) -{ - int32_t op_errno = EINVAL; - xlator_t *subvol = NULL; - map_local_t *local = NULL; - map_private_t *priv = NULL; - xlator_t *xvol = NULL; - off_t xoff = 0; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - if (fd->inode->ino == 1) - goto root_inode; - - subvol = get_mapping_subvol_from_ctx (this, fd->inode); - if (!subvol) { - goto err; - } - - /* Just one callback */ - if (whichop == GF_FOP_READDIR) - STACK_WIND (frame, map_single_readdir_cbk, subvol, - subvol->fops->readdir, fd, size, yoff); - else - STACK_WIND (frame, map_single_readdirp_cbk, subvol, - 
subvol->fops->readdirp, fd, size, yoff); - - return 0; - - root_inode: - /* readdir on '/' */ - local = GF_CALLOC (1, sizeof (map_local_t), - gf_map_mt_map_local_t); - if (!local) { - gf_log (this->name, GF_LOG_ERROR, - "memory allocation failed :("); - op_errno = ENOMEM; - goto err; - } - - priv = this->private; - frame->local = local; - local->op_errno = ENOENT; - local->op_ret = -1; - - local->fd = fd_ref (fd); - local->size = size; - - map_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); - - if (whichop == GF_FOP_READDIR) - STACK_WIND (frame, map_readdir_cbk, xvol, xvol->fops->readdir, - fd, size, xoff); - else - STACK_WIND (frame, map_readdirp_cbk, xvol, xvol->fops->readdirp, - fd, size, xoff); - - return 0; - err: - STACK_UNWIND (frame, -1, op_errno, NULL); - - return 0; -} - - - -int32_t -map_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff) -{ - map_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIR); - return 0; -} - -int32_t -map_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff) -{ - map_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP); - return 0; -} - - -void -fini (xlator_t *this) -{ - map_private_t *priv = NULL; - struct map_pattern *trav_map = NULL; - struct map_pattern *tmp_map = NULL; - - priv = this->private; - - if (priv) { - if (priv->xlarray) - GF_FREE (priv->xlarray); - - trav_map = priv->map; - while (trav_map) { - tmp_map = trav_map; - trav_map = trav_map->next; - GF_FREE (tmp_map); - } - - GF_FREE(priv); - } - - return; -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_map_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; -} - -int -init (xlator_t *this) -{ - map_private_t *priv = NULL; - xlator_list_t *trav = NULL; - int count = 0; - int ret = -1; - char *pattern_string = NULL; - 
char *map_pair_str = NULL; - char *tmp_str = NULL; - char *tmp_str1 = NULL; - char *dup_map_pair = NULL; - char *dir_str = NULL; - char *subvol_str = NULL; - char *map_xl = NULL; - - - if (!this->children) { - gf_log (this->name,GF_LOG_ERROR, - "FATAL: map should have one or more child defined"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - priv = GF_CALLOC (1, sizeof (map_private_t), - gf_map_mt_map_private_t); - this->private = priv; - - /* allocate xlator array */ - trav = this->children; - while (trav) { - count++; - trav = trav->next; - } - priv->xlarray = GF_CALLOC (1, sizeof (struct map_xlator_array) * count, - gf_map_mt_map_xlator_array); - priv->child_count = count; - - /* build xlator array */ - count = 0; - trav = this->children; - while (trav) { - priv->xlarray[count++].xl = trav->xlator; - trav = trav->next; - } - - /* map dir1:brick1;dir2:brick2;dir3:brick3;*:brick4 */ - ret = dict_get_str (this->options, "map-directory", &pattern_string); - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, - "map.pattern not given, can't continue"); - goto err; - } - map_pair_str = strtok_r (pattern_string, ";", &tmp_str); - while (map_pair_str) { - dup_map_pair = gf_strdup (map_pair_str); - dir_str = strtok_r (dup_map_pair, ":", &tmp_str1); - if (!dir_str) { - gf_log (this->name, GF_LOG_ERROR, - "directory string invalid"); - goto err; - } - subvol_str = strtok_r (NULL, ":", &tmp_str1); - if (!subvol_str) { - gf_log (this->name, GF_LOG_ERROR, - "mapping subvolume string invalid"); - goto err; - } - ret = verify_dir_and_assign_subvol (this, - dir_str, - subvol_str); - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, - "verification failed"); - goto err; - } - - GF_FREE (dup_map_pair); - - map_pair_str = strtok_r (NULL, ";", &tmp_str); - } - - /* default-volume brick4 */ - ret = dict_get_str (this->options, "default-volume", &map_xl); - if (ret == 0) { - ret = assign_default_subvol 
(this, map_xl); - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, - "assigning default failed"); - goto err; - } - } - - verify_if_all_subvolumes_got_used (this); - - return 0; - err: - fini (this); - return -1; -} - - -struct xlator_fops fops = { - .lookup = map_lookup, - .mknod = map_mknod, - .create = map_create, - - .stat = map_stat, - .fstat = map_fstat, - .truncate = map_truncate, - .ftruncate = map_ftruncate, - .access = map_access, - .readlink = map_readlink, - .setxattr = map_setxattr, - .getxattr = map_getxattr, - .fsetxattr = map_fsetxattr, - .fgetxattr = map_fgetxattr, - .removexattr = map_removexattr, - .open = map_open, - .readv = map_readv, - .writev = map_writev, - .flush = map_flush, - .fsync = map_fsync, - .statfs = map_statfs, - .lk = map_lk, - .opendir = map_opendir, - .readdir = map_readdir, - .readdirp = map_readdirp, - .fsyncdir = map_fsyncdir, - .symlink = map_symlink, - .unlink = map_unlink, - .link = map_link, - .mkdir = map_mkdir, - .rmdir = map_rmdir, - .rename = map_rename, - .inodelk = map_inodelk, - .finodelk = map_finodelk, - .entrylk = map_entrylk, - .fentrylk = map_fentrylk, - .xattrop = map_xattrop, - .fxattrop = map_fxattrop, - .setdents = map_setdents, - .getdents = map_getdents, - .checksum = map_checksum, - .setattr = map_setattr, - .fsetattr = map_fsetattr, -}; - -struct xlator_cbks cbks = { -}; - -struct volume_options options[] = { - { .key = {"map-directory"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"default-volume"}, - .type = GF_OPTION_TYPE_XLATOR - }, - - { .key = {NULL} } -}; diff --git a/xlators/cluster/map/src/map.h b/xlators/cluster/map/src/map.h deleted file mode 100644 index 44ba3ee125f..00000000000 --- a/xlators/cluster/map/src/map.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#ifndef __MAP_H__ -#define __MAP_H__ - -#include "xlator.h" -#include "map-mem-types.h" - -struct map_pattern { - struct map_pattern *next; - xlator_t *xl; - char *directory; - int dir_len; -}; - -struct map_xlator_array { - xlator_t *xl; - int mapped; /* yes/no */ -}; - -typedef struct { - struct map_pattern *map; - xlator_t *default_xl; - struct map_xlator_array *xlarray; - int child_count; -} map_private_t; - -typedef struct { - int32_t op_ret; - int32_t op_errno; - int call_count; - struct statvfs statvfs; - struct iatt stbuf; - inode_t *inode; - dict_t *dict; - fd_t *fd; - - size_t size; -} map_local_t; - -xlator_t *map_subvol_next (xlator_t *this, xlator_t *prev); -int map_subvol_cnt (xlator_t *this, xlator_t *subvol); - -int map_itransform (xlator_t *this, xlator_t *subvol, - uint64_t x, uint64_t *y_p); -int map_deitransform (xlator_t *this, uint64_t y, - xlator_t **subvol_p, uint64_t *x_p); - - -xlator_t *get_mapping_subvol_from_path (xlator_t *this, const char *path); -xlator_t *get_mapping_subvol_from_ctx (xlator_t *this, inode_t *inode); - -int check_multiple_volume_entry (xlator_t *this, xlator_t *subvol); -int verify_dir_and_assign_subvol (xlator_t *this, - const char *directory, const char *subvol); -int assign_default_subvol (xlator_t *this, const char *default_xl); -void verify_if_all_subvolumes_got_used 
(xlator_t *this); - - -#endif /* __MAP_H__ */ diff --git a/xlators/cluster/stripe/Makefile.am b/xlators/cluster/stripe/Makefile.am deleted file mode 100644 index d471a3f9243..00000000000 --- a/xlators/cluster/stripe/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -SUBDIRS = src - -CLEANFILES = diff --git a/xlators/cluster/stripe/src/Makefile.am b/xlators/cluster/stripe/src/Makefile.am deleted file mode 100644 index 0db3c9eeb5a..00000000000 --- a/xlators/cluster/stripe/src/Makefile.am +++ /dev/null @@ -1,17 +0,0 @@ - -xlator_LTLIBRARIES = stripe.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster - -stripe_la_LDFLAGS = -module -avoidversion - -stripe_la_SOURCES = stripe.c $(top_builddir)/xlators/lib/src/libxlator.c -stripe_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -noinst_HEADERS = stripe.h stripe-mem-types.h $(top_builddir)/xlators/lib/src/libxlator.h - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) \ - -I$(top_srcdir)/xlators/lib/src - -CLEANFILES = - diff --git a/xlators/cluster/stripe/src/stripe-mem-types.h b/xlators/cluster/stripe/src/stripe-mem-types.h deleted file mode 100644 index 2d2757e86a8..00000000000 --- a/xlators/cluster/stripe/src/stripe-mem-types.h +++ /dev/null @@ -1,40 +0,0 @@ - -/* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. 
- - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - - -#ifndef __STRIPE_MEM_TYPES_H__ -#define __STRIPE_MEM_TYPES_H__ - -#include "mem-types.h" - -enum gf_stripe_mem_types_ { - gf_stripe_mt_stripe_local_t = gf_common_mt_end + 1, - gf_stripe_mt_iovec, - gf_stripe_mt_readv_replies, - gf_stripe_mt_stripe_fd_ctx_t, - gf_stripe_mt_char, - gf_stripe_mt_int8_t, - gf_stripe_mt_xlator_t, - gf_stripe_mt_stripe_private_t, - gf_stripe_mt_stripe_options, - gf_stripe_mt_end -}; -#endif - diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c deleted file mode 100644 index 64a26214e76..00000000000 --- a/xlators/cluster/stripe/src/stripe.c +++ /dev/null @@ -1,4080 +0,0 @@ -/* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -/** - * xlators/cluster/stripe: - * Stripe translator, stripes the data accross its child nodes, - * as per the options given in the volfile. The striping works - * fairly simple. It writes files at different offset as per - * calculation. So, 'ls -l' output at the real posix level will - * show file size bigger than the actual size. But when one does - * 'df' or 'du <file>', real size of the file on the server is shown. 
- * - * WARNING: - * Stripe translator can't regenerate data if a child node gets disconnected. - * So, no 'self-heal' for stripe. Hence the advice, use stripe only when its - * very much necessary, or else, use it in combination with AFR, to have a - * backup copy. - */ - -#include "stripe.h" -#include "libxlator.h" - -void -stripe_local_wipe (stripe_local_t *local) -{ - if (!local) - goto out; - - loc_wipe (&local->loc); - loc_wipe (&local->loc2); -out: - return; -} - -/** - * stripe_get_matching_bs - Get the matching block size for the given path. - */ -int32_t -stripe_get_matching_bs (const char *path, struct stripe_options *opts, - uint64_t default_bs) -{ - struct stripe_options *trav = NULL; - char *pathname = NULL; - uint64_t block_size = 0; - - block_size = default_bs; - - if (!path || !opts) - goto out; - - /* FIXME: is a strdup really necessary? */ - pathname = gf_strdup (path); - if (!pathname) - goto out; - - trav = opts; - while (trav) { - if (!fnmatch (trav->path_pattern, pathname, FNM_NOESCAPE)) { - block_size = trav->block_size; - break; - } - trav = trav->next; - } - - GF_FREE (pathname); - -out: - return block_size; -} - - - -int32_t -stripe_sh_chown_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) -{ - int callcnt = -1; - stripe_local_t *local = NULL; - - if (!this || !frame || !frame->local) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - STRIPE_STACK_DESTROY (frame); - } -out: - return 0; -} - -int32_t -stripe_sh_make_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) -{ - stripe_local_t *local = NULL; - call_frame_t *prev = NULL; - - if (!frame || !frame->local || 
!cookie || !this) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - local = frame->local; - - STACK_WIND (frame, stripe_sh_chown_cbk, prev->this, - prev->this->fops->setattr, &local->loc, - &local->stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID)); - -out: - return 0; -} - -int32_t -stripe_entry_self_heal (call_frame_t *frame, xlator_t *this, - stripe_local_t *local) -{ - xlator_list_t *trav = NULL; - call_frame_t *rframe = NULL; - stripe_local_t *rlocal = NULL; - stripe_private_t *priv = NULL; - dict_t *dict = NULL; - int ret = 0; - - if (!local || !this || !frame) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - if (!(IA_ISREG (local->stbuf.ia_type) || - IA_ISDIR (local->stbuf.ia_type))) - return 0; - - priv = this->private; - trav = this->children; - rframe = copy_frame (frame); - if (!rframe) { - goto out; - } - rlocal = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!rlocal) { - goto out; - } - rframe->local = rlocal; - rlocal->call_count = priv->child_count; - loc_copy (&rlocal->loc, &local->loc); - memcpy (&rlocal->stbuf, &local->stbuf, sizeof (struct iatt)); - - dict = dict_new (); - if (!dict) - goto out; - - ret = dict_set_static_bin (dict, "gfid-req", local->stbuf.ia_gfid, 16); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "failed to set gfid-req"); - - while (trav) { - if (IA_ISREG (local->stbuf.ia_type)) { - STACK_WIND (rframe, stripe_sh_make_entry_cbk, - trav->xlator, trav->xlator->fops->mknod, - &local->loc, - st_mode_from_ia (local->stbuf.ia_prot, - local->stbuf.ia_type), 0, - dict); - } - if (IA_ISDIR (local->stbuf.ia_type)) { - STACK_WIND (rframe, stripe_sh_make_entry_cbk, - trav->xlator, trav->xlator->fops->mkdir, - &local->loc, st_mode_from_ia (local->stbuf.ia_prot, - local->stbuf.ia_type), - dict); - } - trav = trav->next; - } - -out: - if (rframe) - STRIPE_STACK_DESTROY (rframe); - if (dict) - dict_unref (dict); - - return 0; -} - 
-int32_t -stripe_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *dict, struct iatt *postparent) -{ - int32_t callcnt = 0; - dict_t *tmp_dict = NULL; - inode_t *tmp_inode = NULL; - stripe_local_t *local = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - if (op_errno != ENOENT) - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, - strerror (op_errno)); - if (local->op_errno != ESTALE) - local->op_errno = op_errno; - if (((op_errno != ENOENT) && (op_errno != ENOTCONN)) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - if (op_errno == ENOENT) - local->entry_self_heal_needed = 1; - } - - if (op_ret >= 0) { - local->op_ret = 0; - - if (FIRST_CHILD(this) == prev->this) { - local->stbuf = *buf; - local->postparent = *postparent; - local->inode = inode_ref (inode); - local->dict = dict_ref (dict); - } - local->stbuf_blocks += buf->ia_blocks; - local->postparent_blocks += postparent->ia_blocks; - - if (local->stbuf_size < buf->ia_size) - local->stbuf_size = buf->ia_size; - if (local->postparent_size < postparent->ia_size) - local->postparent_size = postparent->ia_size; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->op_ret == 0 && local->entry_self_heal_needed) - stripe_entry_self_heal (frame, this, local); - - if (local->failed) - local->op_ret = -1; - - tmp_dict = local->dict; - tmp_inode = local->inode; - - if (local->op_ret != -1) { - local->stbuf.ia_blocks = local->stbuf_blocks; - local->stbuf.ia_size = local->stbuf_size; - local->postparent.ia_blocks = local->postparent_blocks; - local->postparent.ia_size = local->postparent_size; - } - - STRIPE_STACK_UNWIND (lookup, frame, 
local->op_ret, - local->op_errno, local->inode, - &local->stbuf, local->dict, - &local->postparent); - - if (tmp_inode) - inode_unref (tmp_inode); - if (tmp_dict) - dict_unref (tmp_dict); - } -out: - return 0; -} - -int32_t -stripe_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xattr_req) -{ - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - stripe_private_t *priv = NULL; - int32_t op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - priv = this->private; - trav = this->children; - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - loc_copy (&local->loc, loc); - - /* Everytime in stripe lookup, all child nodes - should be looked up */ - local->call_count = priv->child_count; - while (trav) { - STACK_WIND (frame, stripe_lookup_cbk, trav->xlator, - trav->xlator->fops->lookup, - loc, xattr_req); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); - return 0; -} - - -int32_t -stripe_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - prev = cookie; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_errno = op_errno; - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - } - - if (op_ret == 0) { 
- local->op_ret = 0; - - if (FIRST_CHILD(this) == prev->this) { - local->stbuf = *buf; - } - - local->stbuf_blocks += buf->ia_blocks; - if (local->stbuf_size < buf->ia_size) - local->stbuf_size = buf->ia_size; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - if (local->op_ret != -1) { - local->stbuf.ia_size = local->stbuf_size; - local->stbuf.ia_blocks = local->stbuf_blocks; - } - - STRIPE_STACK_UNWIND (stat, frame, local->op_ret, - local->op_errno, &local->stbuf); - } -out: - return 0; -} - -int32_t -stripe_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) -{ - xlator_list_t *trav = NULL; - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - int32_t op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - priv = this->private; - trav = this->children; - - if (priv->first_child_down) { - op_errno = ENOTCONN; - goto err; - } - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_stat_cbk, trav->xlator, - trav->xlator->fops->stat, loc); - trav = trav->next; - } - - return 0; - -err: - STRIPE_STACK_UNWIND (stat, frame, -1, op_errno, NULL); - return 0; -} - - -int32_t -stripe_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct statvfs *stbuf) -{ - stripe_local_t *local = NULL; - int32_t callcnt = 0; - - if (!this || !frame || !frame->local) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - local = frame->local; - - LOCK(&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret && (op_errno != ENOTCONN)) { - local->op_errno = op_errno; - } - 
if (op_ret == 0) { - struct statvfs *dict_buf = &local->statvfs_buf; - dict_buf->f_bsize = stbuf->f_bsize; - dict_buf->f_frsize = stbuf->f_frsize; - dict_buf->f_blocks += stbuf->f_blocks; - dict_buf->f_bfree += stbuf->f_bfree; - dict_buf->f_bavail += stbuf->f_bavail; - dict_buf->f_files += stbuf->f_files; - dict_buf->f_ffree += stbuf->f_ffree; - dict_buf->f_favail += stbuf->f_favail; - dict_buf->f_fsid = stbuf->f_fsid; - dict_buf->f_flag = stbuf->f_flag; - dict_buf->f_namemax = stbuf->f_namemax; - local->op_ret = 0; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - STRIPE_STACK_UNWIND (statfs, frame, local->op_ret, - local->op_errno, &local->statvfs_buf); - } -out: - return 0; -} - -int32_t -stripe_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) -{ - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - stripe_private_t *priv = NULL; - int32_t op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - - trav = this->children; - priv = this->private; - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - local->op_errno = ENOTCONN; - frame->local = local; - - local->call_count = priv->child_count; - while (trav) { - STACK_WIND (frame, stripe_statfs_cbk, trav->xlator, - trav->xlator->fops->statfs, loc); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (statfs, frame, -1, op_errno, NULL); - return 0; -} - - - -int32_t -stripe_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - local = frame->local; - - LOCK 
(&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_errno = op_errno; - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - } - - if (op_ret == 0) { - local->op_ret = 0; - if (FIRST_CHILD(this) == prev->this) { - local->pre_buf = *prebuf; - local->post_buf = *postbuf; - } - - local->prebuf_blocks += prebuf->ia_blocks; - local->postbuf_blocks += postbuf->ia_blocks; - - if (local->prebuf_size < prebuf->ia_size) - local->prebuf_size = prebuf->ia_size; - - if (local->postbuf_size < postbuf->ia_size) - local->postbuf_size = postbuf->ia_size; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - if (local->op_ret != -1) { - local->pre_buf.ia_blocks = local->prebuf_blocks; - local->pre_buf.ia_size = local->prebuf_size; - local->post_buf.ia_blocks = local->postbuf_blocks; - local->post_buf.ia_size = local->postbuf_size; - } - - STRIPE_STACK_UNWIND (truncate, frame, local->op_ret, - local->op_errno, &local->pre_buf, - &local->post_buf); - } -out: - return 0; -} - -int32_t -stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) -{ - xlator_list_t *trav = NULL; - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - int32_t op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - priv = this->private; - trav = this->children; - - if (priv->first_child_down) { - op_errno = ENOTCONN; - goto err; - } - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, 
stripe_truncate_cbk, trav->xlator, - trav->xlator->fops->truncate, loc, offset); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL); - return 0; -} - - -int32_t -stripe_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_errno = op_errno; - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - } - - if (op_ret == 0) { - local->op_ret = 0; - - if (FIRST_CHILD(this) == prev->this) { - local->pre_buf = *preop; - local->post_buf = *postop; - } - - local->prebuf_blocks += preop->ia_blocks; - local->postbuf_blocks += postop->ia_blocks; - - if (local->prebuf_size < preop->ia_size) - local->prebuf_size = preop->ia_size; - if (local->postbuf_size < postop->ia_size) - local->postbuf_size = postop->ia_size; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - if (local->op_ret != -1) { - local->pre_buf.ia_blocks = local->prebuf_blocks; - local->pre_buf.ia_size = local->prebuf_size; - local->post_buf.ia_blocks = local->postbuf_blocks; - local->post_buf.ia_size = local->postbuf_size; - } - - STRIPE_STACK_UNWIND (setattr, frame, local->op_ret, - local->op_errno, &local->pre_buf, - &local->post_buf); - } -out: - return 0; -} - - -int32_t -stripe_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid) -{ - xlator_list_t *trav = NULL; - stripe_local_t *local = 
NULL; - stripe_private_t *priv = NULL; - int32_t op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - priv = this->private; - trav = this->children; - - if (priv->first_child_down) { - op_errno = ENOTCONN; - goto err; - } - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_setattr_cbk, - trav->xlator, trav->xlator->fops->setattr, - loc, stbuf, valid); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL); - return 0; -} - - -int32_t -stripe_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid) -{ - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - priv = this->private; - trav = this->children; - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_setattr_cbk, trav->xlator, - trav->xlator->fops->fsetattr, fd, stbuf, valid); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL); - return 0; -} - -int32_t -stripe_stack_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct 
iatt *prenewparent, struct iatt *postnewparent) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_errno = op_errno; - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - } - - if (op_ret == 0) { - local->op_ret = 0; - - local->stbuf.ia_blocks += buf->ia_blocks; - local->preparent.ia_blocks += preoldparent->ia_blocks; - local->postparent.ia_blocks += postoldparent->ia_blocks; - local->pre_buf.ia_blocks += prenewparent->ia_blocks; - local->post_buf.ia_blocks += postnewparent->ia_blocks; - - if (local->stbuf.ia_size < buf->ia_size) - local->stbuf.ia_size = buf->ia_size; - - if (local->preparent.ia_size < preoldparent->ia_size) - local->preparent.ia_size = preoldparent->ia_size; - - if (local->postparent.ia_size < postoldparent->ia_size) - local->postparent.ia_size = postoldparent->ia_size; - - if (local->pre_buf.ia_size < prenewparent->ia_size) - local->pre_buf.ia_size = prenewparent->ia_size; - - if (local->post_buf.ia_size < postnewparent->ia_size) - local->post_buf.ia_size = postnewparent->ia_size; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - STRIPE_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preparent, - &local->postparent, &local->pre_buf, - &local->post_buf); - } -out: - return 0; -} - -int32_t -stripe_first_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt 
*postnewparent) -{ - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - - if (!this || !frame || !frame->local) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - op_errno = EINVAL; - goto unwind; - } - - if (op_ret == -1) { - goto unwind; - } - - local = frame->local; - trav = this->children; - - local->stbuf = *buf; - local->preparent = *preoldparent; - local->postparent = *postoldparent; - local->pre_buf = *prenewparent; - local->post_buf = *postnewparent; - - local->op_ret = 0; - local->call_count--; - - trav = trav->next; /* Skip first child */ - while (trav) { - STACK_WIND (frame, stripe_stack_rename_cbk, - trav->xlator, trav->xlator->fops->rename, - &local->loc, &local->loc2); - trav = trav->next; - } - return 0; - -unwind: - STRIPE_STACK_UNWIND (rename, frame, -1, op_errno, buf, preoldparent, - postoldparent, prenewparent, postnewparent); - return 0; -} - -int32_t -stripe_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc) -{ - stripe_private_t *priv = NULL; - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (oldloc, err); - VALIDATE_OR_GOTO (oldloc->path, err); - VALIDATE_OR_GOTO (oldloc->inode, err); - VALIDATE_OR_GOTO (newloc, err); - - priv = this->private; - trav = this->children; - - /* If any one node is down, don't allow rename */ - if (priv->nodes_down) { - op_errno = ENOTCONN; - goto err; - } - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - loc_copy (&local->loc, oldloc); - loc_copy (&local->loc2, newloc); - - local->call_count = priv->child_count; - - frame->local = local; - - STACK_WIND (frame, stripe_first_rename_cbk, trav->xlator, - trav->xlator->fops->rename, oldloc, newloc); - - return 0; -err: - STRIPE_STACK_UNWIND (rename, frame, -1, 
op_errno, NULL, NULL, NULL, - NULL, NULL); - return 0; -} - - - -int32_t -stripe_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, "%s returned %s", - prev->this->name, strerror (op_errno)); - local->op_errno = op_errno; - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - } - if (op_ret >= 0) { - local->op_ret = op_ret; - if (FIRST_CHILD(this) == prev->this) { - local->preparent = *preparent; - local->postparent = *postparent; - } - local->preparent_blocks += preparent->ia_blocks; - local->postparent_blocks += postparent->ia_blocks; - - if (local->preparent_size < preparent->ia_size) - local->preparent_size = preparent->ia_size; - - if (local->postparent_size < postparent->ia_size) - local->postparent_size = postparent->ia_size; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - if (local->op_ret != -1) { - local->preparent.ia_blocks = local->preparent_blocks; - local->preparent.ia_size = local->preparent_size; - local->postparent.ia_blocks = local->postparent_blocks; - local->postparent.ia_size = local->postparent_size; - } - - STRIPE_STACK_UNWIND (unlink, frame, local->op_ret, - local->op_errno, &local->preparent, - &local->postparent); - } -out: - return 0; -} - -int32_t -stripe_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) -{ - xlator_list_t *trav = NULL; - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - int32_t op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, err); - 
VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - priv = this->private; - trav = this->children; - - if (priv->first_child_down) { - op_errno = ENOTCONN; - goto err; - } - - /* Don't unlink a file if a node is down */ - if (priv->nodes_down) { - op_errno = ENOTCONN; - goto err; - } - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_unlink_cbk, - trav->xlator, trav->xlator->fops->unlink, - loc); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL); - return 0; -} - - -int32_t -stripe_first_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno,struct iatt *preparent, - struct iatt *postparent) - -{ - xlator_list_t *trav = NULL; - stripe_local_t *local = NULL; - - if (!this || !frame || !frame->local) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - op_errno = EINVAL; - goto err; - } - - if (op_ret == -1) { - goto err; - } - - trav = this->children; - local = frame->local; - - local->call_count--; /* First child successful */ - trav = trav->next; /* Skip first child */ - - local->preparent = *preparent; - local->postparent = *postparent; - local->preparent_size = preparent->ia_size; - local->postparent_size = postparent->ia_size; - local->preparent_blocks += preparent->ia_blocks; - local->postparent_blocks += postparent->ia_blocks; - - while (trav) { - STACK_WIND (frame, stripe_unlink_cbk, trav->xlator, - trav->xlator->fops->rmdir, &local->loc, - local->flags); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (rmdir, frame, op_ret, op_errno, NULL, NULL); - return 0; - -} - -int32_t -stripe_rmdir (call_frame_t 
*frame, xlator_t *this, loc_t *loc, int flags) -{ - xlator_list_t *trav = NULL; - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - int32_t op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - priv = this->private; - trav = this->children; - - /* don't delete a directory if any of the subvolume is down */ - if (priv->nodes_down) { - op_errno = ENOTCONN; - goto err; - } - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - loc_copy (&local->loc, loc); - local->flags = flags; - local->call_count = priv->child_count; - - STACK_WIND (frame, stripe_first_rmdir_cbk, trav->xlator, - trav->xlator->fops->rmdir, loc, flags); - - return 0; -err: - STRIPE_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL); - return 0; -} - - -int32_t -stripe_mknod_ifreg_fail_unlink_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - - if (!this || !frame || !frame->local) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - STRIPE_STACK_UNWIND (mknod, frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, - &local->preparent, &local->postparent); - } -out: - return 0; -} - - -/** - */ -int32_t -stripe_mknod_ifreg_setxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - xlator_list_t *trav = NULL; - call_frame_t *prev = NULL; 
- - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - priv = this->private; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_ret = -1; - local->op_errno = op_errno; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->op_ret == -1) { - local->call_count = priv->child_count; - while (trav) { - STACK_WIND (frame, - stripe_mknod_ifreg_fail_unlink_cbk, - trav->xlator, - trav->xlator->fops->unlink, - &local->loc); - trav = trav->next; - } - return 0; - } - - STRIPE_STACK_UNWIND (mknod, frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, - &local->preparent, &local->postparent); - } -out: - return 0; -} - -int32_t -stripe_mknod_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) -{ - int ret = 0; - int32_t callcnt = 0; - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - priv = this->private; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - local->op_errno = op_errno; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - - if (FIRST_CHILD(this) == prev->this) { - local->stbuf = *buf; - local->preparent = *preparent; - local->postparent = *postparent; - } - - local->stbuf_blocks += buf->ia_blocks; - 
local->preparent_blocks += preparent->ia_blocks; - local->postparent_blocks += postparent->ia_blocks; - - if (local->stbuf_size < buf->ia_size) - local->stbuf_size = buf->ia_size; - if (local->preparent_size < preparent->ia_size) - local->preparent_size = preparent->ia_size; - if (local->postparent_size < postparent->ia_size) - local->postparent_size = postparent->ia_size; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - if (local->op_ret != -1) { - local->preparent.ia_blocks = local->preparent_blocks; - local->preparent.ia_size = local->preparent_size; - local->postparent.ia_blocks = local->postparent_blocks; - local->postparent.ia_size = local->postparent_size; - local->stbuf.ia_size = local->stbuf_size; - local->stbuf.ia_blocks = local->stbuf_blocks; - } - - if ((local->op_ret != -1) && priv->xattr_supported) { - /* Send a setxattr request to nodes where the - files are created */ - int32_t i = 0; - char size_key[256] = {0,}; - char index_key[256] = {0,}; - char count_key[256] = {0,}; - dict_t *dict = NULL; - - sprintf (size_key, - "trusted.%s.stripe-size", this->name); - sprintf (count_key, - "trusted.%s.stripe-count", this->name); - sprintf (index_key, - "trusted.%s.stripe-index", this->name); - - local->call_count = priv->child_count; - memcpy (local->loc.inode->gfid, local->stbuf.ia_gfid, 16); - for (i = 0; i < priv->child_count; i++) { - dict = get_new_dict (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "failed to allocate dict"); - } - - dict_ref (dict); - /* TODO: check return value */ - ret = dict_set_int64 (dict, size_key, - local->stripe_size); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set stripe-size failed", - local->loc.path); - ret = dict_set_int32 (dict, count_key, - priv->child_count); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set child_count failed", - local->loc.path); - ret = dict_set_int32 (dict, index_key, i); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - 
"%s: set stripe-index failed", - local->loc.path); - - STACK_WIND (frame, - stripe_mknod_ifreg_setxattr_cbk, - priv->xl_array[i], - priv->xl_array[i]->fops->setxattr, - &local->loc, dict, 0); - - dict_unref (dict); - } - return 0; - } - - /* Create itself has failed.. so return - without setxattring */ - STRIPE_STACK_UNWIND (mknod, frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, - &local->preparent, &local->postparent); - } -out: - return 0; -} - - -int32_t -stripe_single_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) -{ - STRIPE_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, buf, - preparent, postparent); - return 0; -} - - -int -stripe_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev, dict_t *params) -{ - stripe_private_t *priv = NULL; - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - priv = this->private; - trav = this->children; - - if (priv->first_child_down) { - op_errno = ENOTCONN; - goto err; - } - - if (S_ISREG(mode)) { - /* NOTE: on older kernels (older than 2.6.9), - creat() fops is sent as mknod() + open(). 
Hence handling - S_IFREG files is necessary */ - if (priv->nodes_down) { - gf_log (this->name, GF_LOG_WARNING, - "Some node down, returning EIO"); - op_errno = EIO; - goto err; - } - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - local->op_errno = ENOTCONN; - local->stripe_size = stripe_get_matching_bs (loc->path, - priv->pattern, - priv->block_size); - frame->local = local; - local->inode = loc->inode; - loc_copy (&local->loc, loc); - - /* Everytime in stripe lookup, all child nodes should - be looked up */ - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_mknod_ifreg_cbk, - trav->xlator, trav->xlator->fops->mknod, - loc, mode, rdev, params); - trav = trav->next; - } - - /* This case is handled, no need to continue further. */ - return 0; - } - - STACK_WIND (frame, stripe_single_mknod_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, - loc, mode, rdev, params); - - return 0; -err: - STRIPE_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL); - return 0; -} - - -int32_t -stripe_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - inode_t *local_inode = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_errno = op_errno; - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - } - - if (op_ret >= 0) { - 
local->op_ret = 0; - - if (FIRST_CHILD(this) == prev->this) { - local->inode = inode_ref (inode); - local->stbuf = *buf; - local->postparent = *postparent; - local->preparent = *preparent; - } - local->stbuf_blocks += buf->ia_blocks; - local->preparent_blocks += preparent->ia_blocks; - local->postparent_blocks += postparent->ia_blocks; - - if (local->stbuf_size < buf->ia_size) - local->stbuf_size = buf->ia_size; - if (local->preparent_size < preparent->ia_size) - local->preparent_size = preparent->ia_size; - if (local->postparent_size < postparent->ia_size) - local->postparent_size = postparent->ia_size; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - local_inode = local->inode; - - if (local->op_ret != -1) { - local->preparent.ia_blocks = local->preparent_blocks; - local->preparent.ia_size = local->preparent_size; - local->postparent.ia_blocks = local->postparent_blocks; - local->postparent.ia_size = local->postparent_size; - local->stbuf.ia_size = local->stbuf_size; - local->stbuf.ia_blocks = local->stbuf_blocks; - } - STRIPE_STACK_UNWIND (mkdir, frame, local->op_ret, - local->op_errno, local->inode, - &local->stbuf, &local->preparent, - &local->postparent); - - if (local_inode) - inode_unref (local_inode); - } -out: - return 0; -} - - -int -stripe_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dict_t *params) -{ - stripe_private_t *priv = NULL; - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = 1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - priv = this->private; - trav = this->children; - - if (priv->first_child_down) { - op_errno = ENOTCONN; - goto err; - } - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - 
local->op_ret = -1; - local->call_count = priv->child_count; - frame->local = local; - - /* Everytime in stripe lookup, all child nodes should be looked up */ - while (trav) { - STACK_WIND (frame, stripe_mkdir_cbk, - trav->xlator, trav->xlator->fops->mkdir, - loc, mode, params); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL); - return 0; -} - - -int32_t -stripe_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - inode_t *local_inode = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_errno = op_errno; - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - } - - if (op_ret >= 0) { - local->op_ret = 0; - - if (FIRST_CHILD(this) == prev->this) { - local->inode = inode_ref (inode); - local->stbuf = *buf; - local->postparent = *postparent; - local->preparent = *preparent; - } - local->stbuf_blocks += buf->ia_blocks; - local->preparent_blocks += preparent->ia_blocks; - local->postparent_blocks += postparent->ia_blocks; - - if (local->stbuf_size < buf->ia_size) - local->stbuf_size = buf->ia_size; - if (local->preparent_size < preparent->ia_size) - local->preparent_size = preparent->ia_size; - if (local->postparent_size < postparent->ia_size) - local->postparent_size = postparent->ia_size; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - local_inode = local->inode; - - if 
(local->op_ret != -1) { - local->preparent.ia_blocks = local->preparent_blocks; - local->preparent.ia_size = local->preparent_size; - local->postparent.ia_blocks = local->postparent_blocks; - local->postparent.ia_size = local->postparent_size; - local->stbuf.ia_size = local->stbuf_size; - local->stbuf.ia_blocks = local->stbuf_blocks; - } - STRIPE_STACK_UNWIND (link, frame, local->op_ret, - local->op_errno, local->inode, - &local->stbuf, &local->preparent, - &local->postparent); - - if (local_inode) - inode_unref (local_inode); - } -out: - return 0; -} - -int32_t -stripe_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc) -{ - xlator_list_t *trav = NULL; - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - int32_t op_errno = 1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (oldloc, err); - VALIDATE_OR_GOTO (oldloc->path, err); - VALIDATE_OR_GOTO (oldloc->inode, err); - - priv = this->private; - trav = this->children; - - /* If any one node is down, don't allow link operation */ - if (priv->nodes_down) { - op_errno = ENOTCONN; - goto err; - } - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->call_count = priv->child_count; - - /* Everytime in stripe lookup, all child - nodes should be looked up */ - while (trav) { - STACK_WIND (frame, stripe_link_cbk, - trav->xlator, trav->xlator->fops->link, - oldloc, newloc); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL); - return 0; -} - -int32_t -stripe_create_fail_unlink_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - fd_t *lfd = NULL; - stripe_local_t *local = NULL; - inode_t *local_inode = NULL; - 
- if (!this || !frame || !frame->local) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local_inode = local->inode; - lfd = local->fd; - - STRIPE_STACK_UNWIND (create, frame, local->op_ret, local->op_errno, - local->fd, local->inode, &local->stbuf, - &local->preparent, &local->postparent); - - if (local_inode) - inode_unref (local_inode); - if (lfd) - fd_unref (lfd); - } -out: - return 0; -} - - -int32_t -stripe_create_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - inode_t *local_inode = NULL; - fd_t *lfd = NULL; - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - xlator_list_t *trav = NULL; - int32_t callcnt = 0; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - priv = this->private; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_ret = -1; - local->op_errno = op_errno; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->op_ret == -1) { - local->call_count = priv->child_count; - trav = this->children; - while (trav) { - STACK_WIND (frame, - stripe_create_fail_unlink_cbk, - trav->xlator, - trav->xlator->fops->unlink, - &local->loc); - trav = trav->next; - } - - return 0; - } - - lfd = local->fd; - local_inode = local->inode; - - STRIPE_STACK_UNWIND (create, frame, local->op_ret, local->op_errno, - local->fd, local->inode, &local->stbuf, - &local->preparent, &local->postparent); - - if (local_inode) - inode_unref (local_inode); - if (lfd) - fd_unref (lfd); - } -out: - return 0; -} - -int32_t -stripe_create_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, - inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - fd_t *lfd = NULL; - stripe_fd_ctx_t *fctx = NULL; - inode_t *local_inode = NULL; - call_frame_t *prev = NULL; - int ret = 0; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - priv = this->private; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - local->op_errno = op_errno; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - /* Get the mapping in inode private */ - /* Get the stat buf right */ - if (FIRST_CHILD(this) == prev->this) { - local->stbuf = *buf; - local->preparent = *preparent; - local->postparent = *postparent; - } - - local->stbuf_blocks += buf->ia_blocks; - local->preparent_blocks += preparent->ia_blocks; - local->postparent_blocks += postparent->ia_blocks; - - if (local->stbuf_size < buf->ia_size) - local->stbuf_size = buf->ia_size; - if (local->preparent_size < preparent->ia_size) - local->preparent_size = preparent->ia_size; - if (local->postparent_size < postparent->ia_size) - local->postparent_size = postparent->ia_size; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - if (local->op_ret != -1) { - local->preparent.ia_blocks = local->preparent_blocks; - local->preparent.ia_size = local->preparent_size; - local->postparent.ia_blocks = local->postparent_blocks; - local->postparent.ia_size = local->postparent_size; - local->stbuf.ia_size = local->stbuf_size; - 
local->stbuf.ia_blocks = local->stbuf_blocks; - } - - /* */ - if (local->op_ret >= 0) { - fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t), - gf_stripe_mt_stripe_fd_ctx_t); - if (!fctx) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } - - fctx->stripe_size = local->stripe_size; - fctx->stripe_count = priv->child_count; - fctx->static_array = 1; - fctx->xl_array = priv->xl_array; - fd_ctx_set (local->fd, this, - (uint64_t)(long)fctx); - } - - if ((local->op_ret != -1) && - local->stripe_size && priv->xattr_supported) { - /* Send a setxattr request to nodes where - the files are created */ - int32_t i = 0; - char size_key[256] = {0,}; - char index_key[256] = {0,}; - char count_key[256] = {0,}; - dict_t *dict = NULL; - - sprintf (size_key, - "trusted.%s.stripe-size", this->name); - sprintf (count_key, - "trusted.%s.stripe-count", this->name); - sprintf (index_key, - "trusted.%s.stripe-index", this->name); - - local->call_count = priv->child_count; - memcpy (local->loc.inode->gfid, local->stbuf.ia_gfid, 16); - for (i = 0; i < priv->child_count; i++) { - dict = get_new_dict (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "error allocating dict"); - } - dict_ref (dict); - - /* TODO: check return values */ - ret = dict_set_int64 (dict, size_key, - local->stripe_size); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set stripe-size failed", - local->loc.path); - - ret = dict_set_int32 (dict, count_key, - priv->child_count); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set stripe-size failed", - local->loc.path); - - ret = dict_set_int32 (dict, index_key, i); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set stripe-size failed", - local->loc.path); - - STACK_WIND (frame, stripe_create_setxattr_cbk, - priv->xl_array[i], - priv->xl_array[i]->fops->setxattr, - &local->loc, dict, 0); - - dict_unref (dict); - } - return 0; - } - -unwind: - /* Create itself has failed.. 
so return - without setxattring */ - lfd = local->fd; - local_inode = local->inode; - - STRIPE_STACK_UNWIND (create, frame, local->op_ret, - local->op_errno, local->fd, - local->inode, &local->stbuf, - &local->preparent, &local->postparent); - - if (local_inode) - inode_unref (local_inode); - if (lfd) - fd_unref (lfd); - } - -out: - return 0; -} - - -/** - * stripe_create - If a block-size is specified for the 'name', create the - * file in all the child nodes. If not, create it in only first child. - * - * @name- complete path of the file to be created. - */ -int32_t -stripe_create (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, mode_t mode, fd_t *fd, dict_t *params) -{ - stripe_private_t *priv = NULL; - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - priv = this->private; - - /* files created in O_APPEND mode does not allow lseek() on fd */ - flags &= ~O_APPEND; - - if (priv->first_child_down || priv->nodes_down) { - gf_log (this->name, GF_LOG_DEBUG, - "First node down, returning EIO"); - op_errno = EIO; - goto err; - } - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - local->op_errno = ENOTCONN; - local->stripe_size = stripe_get_matching_bs (loc->path, - priv->pattern, - priv->block_size); - frame->local = local; - local->inode = inode_ref (loc->inode); - loc_copy (&local->loc, loc); - local->fd = fd_ref (fd); - - local->call_count = priv->child_count; - - trav = this->children; - while (trav) { - STACK_WIND (frame, stripe_create_cbk, trav->xlator, - trav->xlator->fops->create, loc, flags, - mode, fd, params); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (create, frame, -1, 
op_errno, NULL, NULL, NULL, - NULL, NULL); - return 0; -} - -int32_t -stripe_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - fd_t *lfd = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - local->op_errno = op_errno; - } - - if (op_ret >= 0) - local->op_ret = op_ret; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - if (local->op_ret == -1) { - if (local->fctx) { - if (!local->fctx->static_array) - GF_FREE (local->fctx->xl_array); - GF_FREE (local->fctx); - } - } else { - fd_ctx_set (local->fd, this, - (uint64_t)(long)local->fctx); - } - - lfd = local->fd; - - STRIPE_STACK_UNWIND (open, frame, local->op_ret, - local->op_errno, local->fd); - if (lfd) - fd_unref (lfd); - - } -out: - return 0; -} - - -int32_t -stripe_open_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) -{ - int32_t index = 0; - int32_t callcnt = 0; - char key[256] = {0,}; - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - stripe_private_t *priv = NULL; - data_t *data = NULL; - call_frame_t *prev = NULL; - fd_t *lfd = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = (call_frame_t *)cookie; - priv = this->private; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log 
(this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_ret = -1; - if (local->op_errno != EIO) - local->op_errno = op_errno; - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - goto unlock; - } - - if (!dict) - goto unlock; - - if (!local->fctx) { - local->fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t), - gf_stripe_mt_stripe_fd_ctx_t); - if (!local->fctx) { - local->op_errno = ENOMEM; - local->op_ret = -1; - goto unlock; - } - - local->fctx->static_array = 0; - } - /* Stripe block size */ - sprintf (key, "trusted.%s.stripe-size", this->name); - data = dict_get (dict, key); - if (!data) { - local->xattr_self_heal_needed = 1; - } else { - if (!local->fctx->stripe_size) { - local->fctx->stripe_size = - data_to_int64 (data); - } - - if (local->fctx->stripe_size != data_to_int64 (data)) { - gf_log (this->name, GF_LOG_WARNING, - "stripe-size mismatch in blocks"); - local->xattr_self_heal_needed = 1; - } - } - /* Stripe count */ - sprintf (key, "trusted.%s.stripe-count", this->name); - data = dict_get (dict, key); - if (!data) { - local->xattr_self_heal_needed = 1; - goto unlock; - } - if (!local->fctx->xl_array) { - local->fctx->stripe_count = data_to_int32 (data); - if (!local->fctx->stripe_count) { - gf_log (this->name, GF_LOG_ERROR, - "error with stripe-count xattr"); - local->op_ret = -1; - local->op_errno = EIO; - goto unlock; - } - - local->fctx->xl_array = - GF_CALLOC (local->fctx->stripe_count, - sizeof (xlator_t *), - gf_stripe_mt_xlator_t); - if (!local->fctx->xl_array) { - local->op_errno = ENOMEM; - local->op_ret = -1; - goto unlock; - } - } - if (local->fctx->stripe_count != data_to_int32 (data)) { - gf_log (this->name, GF_LOG_ERROR, - "error with stripe-count xattr (%d != %d)", - local->fctx->stripe_count, data_to_int32 (data)); - local->op_ret = -1; - local->op_errno = EIO; - goto unlock; - } - - /* index */ - sprintf (key, "trusted.%s.stripe-index", this->name); - 
data = dict_get (dict, key); - if (!data) { - local->xattr_self_heal_needed = 1; - goto unlock; - } - index = data_to_int32 (data); - if (index > priv->child_count) { - gf_log (this->name, GF_LOG_ERROR, - "error with stripe-index xattr (%d)", index); - local->op_ret = -1; - local->op_errno = EIO; - goto unlock; - } - if (local->fctx->xl_array) { - if (local->fctx->xl_array[index]) { - gf_log (this->name, GF_LOG_ERROR, - "duplicate entry @ index (%d)", index); - local->op_ret = -1; - local->op_errno = EIO; - goto unlock; - } - local->fctx->xl_array[index] = prev->this; - } - local->entry_count++; - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); - - if (!callcnt) { - /* TODO: if self-heal flag is set, do it */ - if (local->xattr_self_heal_needed) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: stripe info need to be healed", - local->loc.path); - } - - if (local->failed) - local->op_ret = -1; - - if (local->op_ret) - goto err; - - if (local->entry_count != local->fctx->stripe_count) { - gf_log (this->name, GF_LOG_ERROR, - "entry-count (%d) != stripe-count (%d)", - local->entry_count, local->fctx->stripe_count); - local->op_ret = -1; - local->op_errno = EIO; - goto err; - } - if (!local->fctx->stripe_size) { - gf_log (this->name, GF_LOG_ERROR, "stripe size not set"); - local->op_ret = -1; - local->op_errno = EIO; - goto err; - } - - local->call_count = local->fctx->stripe_count; - - trav = this->children; - while (trav) { - STACK_WIND (frame, stripe_open_cbk, trav->xlator, - trav->xlator->fops->open, &local->loc, - local->flags, local->fd, 0); - trav = trav->next; - } - } - - return 0; -err: - lfd = local->fd; - - STRIPE_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, - local->fd); - if (lfd) - fd_unref (lfd); -out: - return 0; -} - -/** - * stripe_open - - */ -int32_t -stripe_open (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, fd_t *fd, int32_t wbflags) -{ - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - 
xlator_list_t *trav = NULL; - int32_t op_errno = 1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - priv = this->private; - trav = this->children; - - if (priv->first_child_down) { - op_errno = ENOTCONN; - goto err; - } - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - /* files opened in O_APPEND mode does not allow lseek() on fd */ - flags &= ~O_APPEND; - - local->fd = fd_ref (fd); - frame->local = local; - loc_copy (&local->loc, loc); - - /* Striped files */ - local->flags = flags; - local->call_count = priv->child_count; - local->stripe_size = stripe_get_matching_bs (loc->path, - priv->pattern, - priv->block_size); - - if (priv->xattr_supported) { - while (trav) { - STACK_WIND (frame, stripe_open_getxattr_cbk, - trav->xlator, trav->xlator->fops->getxattr, - loc, NULL); - trav = trav->next; - } - return 0; - } - local->fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t), - gf_stripe_mt_stripe_fd_ctx_t); - if (!local->fctx) { - op_errno = ENOMEM; - goto err; - } - - local->fctx->static_array = 1; - local->fctx->stripe_size = local->stripe_size; - local->fctx->stripe_count = priv->child_count; - local->fctx->xl_array = priv->xl_array; - - while (trav) { - STACK_WIND (frame, stripe_open_cbk, trav->xlator, - trav->xlator->fops->open, - &local->loc, local->flags, local->fd, - wbflags); - trav = trav->next; - } - return 0; -err: - STRIPE_STACK_UNWIND (open, frame, -1, op_errno, NULL); - return 0; -} - - -int32_t -stripe_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - fd_t *local_fd = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible 
NULL deref"); - goto out; - } - - prev = cookie; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_ret = -1; - local->op_errno = op_errno; - } - - if (op_ret >= 0) - local->op_ret = op_ret; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local_fd = local->fd; - STRIPE_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); - if (local_fd) - fd_unref (local_fd); - } -out: - return 0; -} - - -int32_t -stripe_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) -{ - xlator_list_t *trav = NULL; - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - int32_t op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - priv = this->private; - trav = this->children; - - if (priv->first_child_down) { - op_errno = ENOTCONN; - goto err; - } - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - frame->local = local; - local->call_count = priv->child_count; - local->fd = fd_ref (fd); - - while (trav) { - STACK_WIND (frame, stripe_opendir_cbk, trav->xlator, - trav->xlator->fops->opendir, loc, fd); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (opendir, frame, -1, op_errno, NULL); - return 0; -} - -int32_t -stripe_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - local = frame->local; - - LOCK 
(&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_errno = op_errno; - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - } - if (op_ret >= 0) { - if (FIRST_CHILD(this) == prev->this) { - /* First successful call, copy the *lock */ - local->op_ret = op_ret; - local->lock = *lock; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - STRIPE_STACK_UNWIND (lk, frame, local->op_ret, - local->op_errno, &local->lock); - } -out: - return 0; -} - -int32_t -stripe_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct gf_flock *lock) -{ - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - stripe_private_t *priv = NULL; - int32_t op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - trav = this->children; - priv = this->private; - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_lk_cbk, trav->xlator, - trav->xlator->fops->lk, fd, cmd, lock); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (lk, frame, -1, op_errno, NULL); - return 0; -} - - -int32_t -stripe_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; 
- - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned %s", - prev->this->name, strerror (op_errno)); - local->op_errno = op_errno; - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - } - if (op_ret >= 0) - local->op_ret = op_ret; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - STRIPE_STACK_UNWIND (flush, frame, local->op_ret, - local->op_errno); - } -out: - return 0; -} - -int32_t -stripe_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = 1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - priv = this->private; - trav = this->children; - - if (priv->first_child_down) { - op_errno = ENOTCONN; - goto err; - } - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_flush_cbk, trav->xlator, - trav->xlator->fops->flush, fd); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (flush, frame, -1, op_errno); - return 0; -} - - - -int32_t -stripe_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned %s", - prev->this->name, strerror 
(op_errno)); - local->op_errno = op_errno; - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - } - if (op_ret >= 0) { - local->op_ret = op_ret; - if (FIRST_CHILD(this) == prev->this) { - local->pre_buf = *prebuf; - local->post_buf = *postbuf; - } - local->prebuf_blocks += prebuf->ia_blocks; - local->postbuf_blocks += postbuf->ia_blocks; - - if (local->prebuf_size < prebuf->ia_size) - local->prebuf_size = prebuf->ia_size; - - if (local->postbuf_size < postbuf->ia_size) - local->postbuf_size = postbuf->ia_size; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - if (local->op_ret != -1) { - local->pre_buf.ia_blocks = local->prebuf_blocks; - local->pre_buf.ia_size = local->prebuf_size; - local->post_buf.ia_blocks = local->postbuf_blocks; - local->post_buf.ia_size = local->postbuf_size; - } - - STRIPE_STACK_UNWIND (fsync, frame, local->op_ret, - local->op_errno, &local->pre_buf, - &local->post_buf); - } -out: - return 0; -} - -int32_t -stripe_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) -{ - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = 1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - priv = this->private; - trav = this->children; - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_fsync_cbk, trav->xlator, - trav->xlator->fops->fsync, fd, flags); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL); - return 0; -} - -int32_t -stripe_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, 
int32_t op_errno, struct iatt *buf) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_errno = op_errno; - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - } - - if (op_ret == 0) { - local->op_ret = 0; - - if (FIRST_CHILD(this) == prev->this) - local->stbuf = *buf; - - local->stbuf_blocks += buf->ia_blocks; - if (local->stbuf_size < buf->ia_size) - local->stbuf_size = buf->ia_size; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - if (local->op_ret != -1) { - local->stbuf.ia_size = local->stbuf_size; - local->stbuf.ia_blocks = local->stbuf_blocks; - } - - STRIPE_STACK_UNWIND (fstat, frame, local->op_ret, - local->op_errno, &local->stbuf); - } - -out: - return 0; -} - -int32_t -stripe_fstat (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = 1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - priv = this->private; - trav = this->children; - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_fstat_cbk, trav->xlator, - trav->xlator->fops->fstat, fd); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (fstat, frame, -1, 
op_errno, NULL); - return 0; -} - - -int32_t -stripe_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) -{ - stripe_local_t *local = NULL; - stripe_private_t *priv = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = 1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - priv = this->private; - trav = this->children; - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_truncate_cbk, trav->xlator, - trav->xlator->fops->ftruncate, fd, offset); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL); - return 0; -} - - -int32_t -stripe_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned %s", - prev->this->name, strerror (op_errno)); - local->op_errno = op_errno; - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - } - if (op_ret >= 0) - local->op_ret = op_ret; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - STRIPE_STACK_UNWIND (fsyncdir, frame, local->op_ret, - local->op_errno); - } -out: - return 0; -} - -int32_t -stripe_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) -{ - stripe_local_t *local = NULL; - stripe_private_t 
*priv = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = 1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - priv = this->private; - trav = this->children; - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - local->call_count = priv->child_count; - - while (trav) { - STACK_WIND (frame, stripe_fsyncdir_cbk, trav->xlator, - trav->xlator->fops->fsyncdir, fd, flags); - trav = trav->next; - } - - return 0; -err: - STRIPE_STACK_UNWIND (fsyncdir, frame, -1, op_errno); - return 0; -} - - -int32_t -stripe_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) -{ - int32_t i = 0; - int32_t callcnt = 0; - int32_t count = 0; - stripe_local_t *local = NULL; - struct iovec *vec = NULL; - struct iatt tmp_stbuf = {0,}; - struct iobref *tmp_iobref = NULL; - struct iobuf *iobuf = NULL; - - if (!this || !frame || !frame->local) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret != -1) - if (local->stbuf_size < buf->ia_size) - local->stbuf_size = buf->ia_size; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - op_ret = 0; - - /* Keep extra space for filling in '\0's */ - vec = GF_CALLOC ((local->count * 2), sizeof (struct iovec), - gf_stripe_mt_iovec); - if (!vec) { - op_ret = -1; - goto done; - } - - for (i = 0; i < local->wind_count; i++) { - if (local->replies[i].op_ret) { - memcpy ((vec + count), local->replies[i].vector, - (local->replies[i].count * sizeof (struct iovec))); - count += local->replies[i].count; - op_ret += local->replies[i].op_ret; - } - if ((local->replies[i].op_ret < - local->replies[i].requested_size) && - (local->stbuf_size 
> (local->offset + op_ret))) { - /* Fill in 0s here */ - vec[count].iov_len = - (local->replies[i].requested_size - - local->replies[i].op_ret); - iobuf = iobuf_get (this->ctx->iobuf_pool); - if (!iobuf) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - op_ret = -1; - op_errno = ENOMEM; - goto done; - } - memset (iobuf->ptr, 0, vec[count].iov_len); - iobref_add (local->iobref, iobuf); - vec[count].iov_base = iobuf->ptr; - - op_ret += vec[count].iov_len; - count++; - } - GF_FREE (local->replies[i].vector); - } - - /* FIXME: notice that st_ino, and st_dev (gen) will be - * different than what inode will have. Make sure this doesn't - * cause any bugs at higher levels */ - memcpy (&tmp_stbuf, &local->replies[0].stbuf, - sizeof (struct iatt)); - tmp_stbuf.ia_size = local->stbuf_size; - - done: - GF_FREE (local->replies); - tmp_iobref = local->iobref; - fd_unref (local->fd); - STRIPE_STACK_UNWIND (readv, frame, op_ret, op_errno, vec, - count, &tmp_stbuf, tmp_iobref); - - iobref_unref (tmp_iobref); - if (vec) - GF_FREE (vec); - } -out: - return 0; -} - -/** - * stripe_readv_cbk - get all the striped reads, and order it properly, send it - * to above layer after putting it in a single vector. 
- */ -int32_t -stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, struct iobref *iobref) -{ - int32_t index = 0; - int32_t callcnt = 0; - int32_t final_count = 0; - int32_t need_to_check_proper_size = 0; - call_frame_t *mframe = NULL; - stripe_local_t *mlocal = NULL; - stripe_local_t *local = NULL; - struct iovec *final_vec = NULL; - struct iatt tmp_stbuf = {0,}; - struct iobref *tmp_iobref = NULL; - stripe_fd_ctx_t *fctx = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto end; - } - - local = frame->local; - index = local->node_index; - mframe = local->orig_frame; - if (!mframe) - goto out; - - mlocal = mframe->local; - if (!mlocal) - goto out; - - fctx = mlocal->fctx; - - LOCK (&mframe->lock); - { - mlocal->replies[index].op_ret = op_ret; - mlocal->replies[index].op_errno = op_errno; - mlocal->replies[index].requested_size = local->readv_size; - if (op_ret >= 0) { - mlocal->replies[index].stbuf = *stbuf; - mlocal->replies[index].count = count; - mlocal->replies[index].vector = iov_dup (vector, count); - - if (!mlocal->iobref) - mlocal->iobref = iobref_new (); - iobref_merge (mlocal->iobref, iobref); - } - callcnt = ++mlocal->call_count; - } - UNLOCK(&mframe->lock); - - if (callcnt == mlocal->wind_count) { - op_ret = 0; - - for (index=0; index < mlocal->wind_count; index++) { - /* check whether each stripe returned - * 'expected' number of bytes */ - if (mlocal->replies[index].op_ret == -1) { - op_ret = -1; - op_errno = mlocal->replies[index].op_errno; - break; - } - /* TODO: handle the 'holes' within the read range - properly */ - if (mlocal->replies[index].op_ret < - mlocal->replies[index].requested_size) { - need_to_check_proper_size = 1; - } - - op_ret += mlocal->replies[index].op_ret; - mlocal->count += mlocal->replies[index].count; - } - if (op_ret == -1) - goto done; - if 
(need_to_check_proper_size) - goto check_size; - - final_vec = GF_CALLOC (mlocal->count, sizeof (struct iovec), - gf_stripe_mt_iovec); - - if (!final_vec) { - op_ret = -1; - goto done; - } - - for (index = 0; index < mlocal->wind_count; index++) { - memcpy ((final_vec + final_count), - mlocal->replies[index].vector, - (mlocal->replies[index].count * - sizeof (struct iovec))); - final_count += mlocal->replies[index].count; - GF_FREE (mlocal->replies[index].vector); - } - - /* FIXME: notice that st_ino, and st_dev (gen) will be - * different than what inode will have. Make sure this doesn't - * cause any bugs at higher levels */ - memcpy (&tmp_stbuf, &mlocal->replies[0].stbuf, - sizeof (struct iatt)); - - done: - /* */ - GF_FREE (mlocal->replies); - tmp_iobref = mlocal->iobref; - fd_unref (mlocal->fd); - STRIPE_STACK_UNWIND (readv, mframe, op_ret, op_errno, final_vec, - final_count, &tmp_stbuf, tmp_iobref); - - iobref_unref (tmp_iobref); - if (final_vec) - GF_FREE (final_vec); - } - - goto out; - -check_size: - mlocal->call_count = fctx->stripe_count; - - for (index = 0; index < fctx->stripe_count; index++) { - STACK_WIND (mframe, stripe_readv_fstat_cbk, - (fctx->xl_array[index]), - (fctx->xl_array[index])->fops->fstat, - mlocal->fd); - } - -out: - STRIPE_STACK_DESTROY (frame); -end: - return 0; -} - - -int32_t -stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset) -{ - int32_t op_errno = EINVAL; - int32_t idx = 0; - int32_t index = 0; - int32_t num_stripe = 0; - int32_t off_index = 0; - size_t frame_size = 0; - off_t rounded_end = 0; - uint64_t tmp_fctx = 0; - uint64_t stripe_size = 0; - off_t rounded_start = 0; - off_t frame_offset = offset; - stripe_local_t *local = NULL; - call_frame_t *rframe = NULL; - stripe_local_t *rlocal = NULL; - stripe_fd_ctx_t *fctx = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - fd_ctx_get (fd, this, 
&tmp_fctx); - if (!tmp_fctx) { - op_errno = EBADFD; - goto err; - } - fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; - stripe_size = fctx->stripe_size; - - if (!stripe_size) { - gf_log (this->name, GF_LOG_DEBUG, - "Wrong stripe size for the file"); - goto err; - } - /* The file is stripe across the child nodes. Send the read request - * to the child nodes appropriately after checking which region of - * the file is in which child node. Always '0-<stripe_size>' part of - * the file resides in the first child. - */ - rounded_start = floor (offset, stripe_size); - rounded_end = roof (offset+size, stripe_size); - num_stripe = (rounded_end- rounded_start)/stripe_size; - - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - frame->local = local; - - /* This is where all the vectors should be copied. */ - local->replies = GF_CALLOC (num_stripe, sizeof (struct readv_replies), - gf_stripe_mt_readv_replies); - if (!local->replies) { - op_errno = ENOMEM; - goto err; - } - - off_index = (offset / stripe_size) % fctx->stripe_count; - local->wind_count = num_stripe; - local->readv_size = size; - local->offset = offset; - local->fd = fd_ref (fd); - local->fctx = fctx; - - for (index = off_index; index < (num_stripe + off_index); index++) { - rframe = copy_frame (frame); - rlocal = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!rlocal) { - op_errno = ENOMEM; - goto err; - } - - frame_size = min (roof (frame_offset+1, stripe_size), - (offset + size)) - frame_offset; - - rlocal->node_index = index - off_index; - rlocal->orig_frame = frame; - rlocal->readv_size = frame_size; - rframe->local = rlocal; - idx = (index % fctx->stripe_count); - STACK_WIND (rframe, stripe_readv_cbk, fctx->xl_array[idx], - fctx->xl_array[idx]->fops->readv, - fd, frame_size, frame_offset); - - frame_offset += frame_size; - } - - return 0; -err: - if (local && local->fd) - fd_unref (local->fd); - if 
(rframe) - STRIPE_STACK_DESTROY (rframe); - - STRIPE_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL); - return 0; -} - - -int32_t -stripe_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) -{ - int32_t callcnt = 0; - stripe_local_t *local = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = cookie; - local = frame->local; - - LOCK(&frame->lock); - { - callcnt = ++local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_errno = op_errno; - local->op_ret = -1; - } - if (op_ret >= 0) { - local->op_ret += op_ret; - local->post_buf = *postbuf; - local->pre_buf = *prebuf; - } - } - UNLOCK (&frame->lock); - - if ((callcnt == local->wind_count) && local->unwind) { - STRIPE_STACK_UNWIND (writev, frame, local->op_ret, - local->op_errno, &local->pre_buf, - &local->post_buf); - } -out: - return 0; -} - -int32_t -stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) -{ - struct iovec *tmp_vec = NULL; - stripe_local_t *local = NULL; - stripe_fd_ctx_t *fctx = NULL; - int32_t op_errno = 1; - int32_t idx = 0; - int32_t total_size = 0; - int32_t offset_offset = 0; - int32_t remaining_size = 0; - int32_t tmp_count = count; - off_t fill_size = 0; - uint64_t stripe_size = 0; - uint64_t tmp_fctx = 0; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - - fd_ctx_get (fd, this, &tmp_fctx); - if (!tmp_fctx) { - op_errno = EINVAL; - goto err; - } - fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; - stripe_size = fctx->stripe_size; - - /* File has to be stripped across the child nodes */ - for (idx = 0; 
idx< count; idx ++) { - total_size += vector[idx].iov_len; - } - remaining_size = total_size; - - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - frame->local = local; - local->stripe_size = stripe_size; - - while (1) { - /* Send striped chunk of the vector to child - nodes appropriately. */ - idx = (((offset + offset_offset) / - local->stripe_size) % fctx->stripe_count); - - fill_size = (local->stripe_size - - ((offset + offset_offset) % local->stripe_size)); - if (fill_size > remaining_size) - fill_size = remaining_size; - - remaining_size -= fill_size; - - tmp_count = iov_subset (vector, count, offset_offset, - offset_offset + fill_size, NULL); - tmp_vec = GF_CALLOC (tmp_count, sizeof (struct iovec), - gf_stripe_mt_iovec); - if (!tmp_vec) { - op_errno = ENOMEM; - goto err; - } - tmp_count = iov_subset (vector, count, offset_offset, - offset_offset + fill_size, tmp_vec); - - local->wind_count++; - if (remaining_size == 0) - local->unwind = 1; - - STACK_WIND (frame, stripe_writev_cbk, fctx->xl_array[idx], - fctx->xl_array[idx]->fops->writev, fd, tmp_vec, - tmp_count, offset + offset_offset, iobref); - GF_FREE (tmp_vec); - offset_offset += fill_size; - if (remaining_size == 0) - break; - } - - return 0; -err: - STRIPE_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL); - return 0; -} - - -int32_t -stripe_release (xlator_t *this, fd_t *fd) -{ - uint64_t tmp_fctx = 0; - stripe_fd_ctx_t *fctx = NULL; - - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - fd_ctx_del (fd, this, &tmp_fctx); - if (!tmp_fctx) { - goto err; - } - - fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; - - if (!fctx->static_array) - GF_FREE (fctx->xl_array); - - GF_FREE (fctx); - -err: - return 0; -} - - -int32_t -notify (xlator_t *this, int32_t event, void *data, ...) 
-{ - stripe_private_t *priv = NULL; - int down_client = 0; - int i = 0; - - if (!this) - return 0; - - priv = this->private; - if (!priv) - return 0; - - switch (event) - { - case GF_EVENT_CHILD_UP: - case GF_EVENT_CHILD_CONNECTING: - { - /* get an index number to set */ - for (i = 0; i < priv->child_count; i++) { - if (data == priv->xl_array[i]) - break; - } - priv->state[i] = 1; - for (i = 0; i < priv->child_count; i++) { - if (!priv->state[i]) - down_client++; - } - - LOCK (&priv->lock); - { - priv->nodes_down = down_client; - if (data == FIRST_CHILD (this)) - priv->first_child_down = 0; - if (!priv->nodes_down) - default_notify (this, event, data); - } - UNLOCK (&priv->lock); - } - break; - case GF_EVENT_CHILD_DOWN: - { - /* get an index number to set */ - for (i = 0; i < priv->child_count; i++) { - if (data == priv->xl_array[i]) - break; - } - priv->state[i] = 0; - for (i = 0; i < priv->child_count; i++) { - if (!priv->state[i]) - down_client++; - } - - LOCK (&priv->lock); - { - priv->nodes_down = down_client; - - if (data == FIRST_CHILD (this)) - priv->first_child_down = 1; - if (priv->nodes_down) - default_notify (this, event, data); - } - UNLOCK (&priv->lock); - } - break; - - default: - { - /* */ - default_notify (this, event, data); - } - break; - } - - return 0; -} - -int -set_stripe_block_size (xlator_t *this, stripe_private_t *priv, char *data) -{ - int ret = -1; - char *tmp_str = NULL; - char *tmp_str1 = NULL; - char *dup_str = NULL; - char *stripe_str = NULL; - char *pattern = NULL; - char *num = NULL; - struct stripe_options *temp_stripeopt = NULL; - struct stripe_options *stripe_opt = NULL; - - if (!this || !priv || !data) - goto out; - - /* Get the pattern for striping. 
- "option block-size *avi:10MB" etc */ - stripe_str = strtok_r (data, ",", &tmp_str); - while (stripe_str) { - dup_str = gf_strdup (stripe_str); - stripe_opt = CALLOC (1, sizeof (struct stripe_options)); - if (!stripe_opt) { - GF_FREE (dup_str); - goto out; - } - - pattern = strtok_r (dup_str, ":", &tmp_str1); - num = strtok_r (NULL, ":", &tmp_str1); - if (!num) { - num = pattern; - pattern = "*"; - } - if (gf_string2bytesize (num, &stripe_opt->block_size) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", num); - goto out; - } - memcpy (stripe_opt->path_pattern, pattern, strlen (pattern)); - - gf_log (this->name, GF_LOG_DEBUG, - "block-size : pattern %s : size %"PRId64, - stripe_opt->path_pattern, stripe_opt->block_size); - - if (!priv->pattern) { - priv->pattern = stripe_opt; - } else { - temp_stripeopt = priv->pattern; - while (temp_stripeopt->next) - temp_stripeopt = temp_stripeopt->next; - temp_stripeopt->next = stripe_opt; - } - stripe_str = strtok_r (NULL, ",", &tmp_str); - GF_FREE (dup_str); - } - - ret = 0; -out: - return ret; -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - goto out; - - ret = xlator_mem_acct_init (this, gf_stripe_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - goto out; - } - -out: - return ret; -} -int -validate_options (xlator_t *this, char **op_errstr) -{ - int ret = 0; - volume_opt_list_t *vol_opt = NULL; - volume_opt_list_t *tmp; - - if (!this) { - gf_log (this->name, GF_LOG_DEBUG, "'this' not a valid ptr"); - ret =-1; - goto out; - } - - if (list_empty (&this->volume_options)) - goto out; - - vol_opt = list_entry (this->volume_options.next, - volume_opt_list_t, list); - list_for_each_entry_safe (vol_opt, tmp, &this->volume_options, list) { - ret = validate_xlator_volume_options_attacherr (this, - vol_opt->given_opt, - op_errstr); - } - -out: - - return ret; -} - -int -reconfigure (xlator_t *this, dict_t *options) 
-{ - - stripe_private_t *priv = NULL; - data_t *data = NULL; - int ret = 0; - - priv = this->private; - - data = dict_get (options, "block-size"); - if (data) { - gf_log (this->name, GF_LOG_TRACE,"Reconfiguring Stripe" - " Block-size"); - ret = set_stripe_block_size (this, priv, data->data); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "Reconfigue: Block-Size reconfiguration failed"); - ret = -1; - goto out; - } - gf_log (this->name, GF_LOG_TRACE, - "Reconfigue: Block-Size reconfigured Successfully"); - } - else { - priv->block_size = (128 * GF_UNIT_KB); - } - -out: - return ret; - -} - -/** - * init - This function is called when xlator-graph gets initialized. - * The option given in volfiles are parsed here. - * @this - - */ -int32_t -init (xlator_t *this) -{ - stripe_private_t *priv = NULL; - xlator_list_t *trav = NULL; - data_t *data = NULL; - int32_t count = 0; - int ret = -1; - - if (!this) - goto out; - - trav = this->children; - while (trav) { - count++; - trav = trav->next; - } - - if (!count) { - gf_log (this->name, GF_LOG_ERROR, - "stripe configured without \"subvolumes\" option. " - "exiting"); - goto out; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - if (count == 1) { - gf_log (this->name, GF_LOG_ERROR, - "stripe configured with only one \"subvolumes\" option." - " please check the volume. 
exiting"); - goto out; - } - - priv = GF_CALLOC (1, sizeof (stripe_private_t), - gf_stripe_mt_stripe_private_t); - - if (!priv) - goto out; - priv->xl_array = GF_CALLOC (count, sizeof (xlator_t *), - gf_stripe_mt_xlator_t); - if (!priv->xl_array) - goto out; - - priv->state = GF_CALLOC (count, sizeof (int8_t), - gf_stripe_mt_int8_t); - if (!priv->state) - goto out; - - priv->child_count = count; - LOCK_INIT (&priv->lock); - - trav = this->children; - count = 0; - while (trav) { - priv->xl_array[count++] = trav->xlator; - trav = trav->next; - } - - if (count > 256) { - gf_log (this->name, GF_LOG_ERROR, - "maximum number of stripe subvolumes supported " - "is 256"); - goto out; - } - - priv->block_size = (128 * GF_UNIT_KB); - /* option stripe-pattern *avi:1GB,*pdf:4096 */ - data = dict_get (this->options, "block-size"); - if (!data) { - gf_log (this->name, GF_LOG_DEBUG, - "No \"option block-size <x>\" given, defaulting " - "to 128KB"); - } else { - ret = set_stripe_block_size (this, priv, data->data); - if (ret) - goto out; - } - - priv->xattr_supported = 1; - data = dict_get (this->options, "use-xattr"); - if (data) { - if (gf_string2boolean (data->data, - &priv->xattr_supported) == -1) { - gf_log (this->name, GF_LOG_ERROR, - "error setting hard check for extended " - "attribute"); - //return -1; - } - } - - /* notify related */ - priv->nodes_down = priv->child_count; - this->private = priv; - - - ret = 0; -out: - if (ret) { - if (priv) { - if (priv->xl_array) - GF_FREE (priv->xl_array); - GF_FREE (priv); - } - } - return ret; -} - -/** - * fini - Free all the private variables - * @this - - */ -void -fini (xlator_t *this) -{ - stripe_private_t *priv = NULL; - struct stripe_options *prev = NULL; - struct stripe_options *trav = NULL; - - if (!this) - goto out; - - priv = this->private; - if (priv) { - this->private = NULL; - if (priv->xl_array) - GF_FREE (priv->xl_array); - - trav = priv->pattern; - while (trav) { - prev = trav; - trav = trav->next; - FREE (prev); - 
} - LOCK_DESTROY (&priv->lock); - GF_FREE (priv); - } - -out: - return; -} - -int32_t -stripe_getxattr_unwind (call_frame_t *frame, - int op_ret, int op_errno, dict_t *dict) - -{ - STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict); - return 0; -} - -int32_t -stripe_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) -{ - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - stripe_private_t *priv = NULL; - int32_t op_errno = EINVAL; - int i = 0; - xlator_t **sub_volumes; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (loc->inode, err); - - priv = this->private; - trav = this->children; - - /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); - if (!local) { - op_errno = ENOMEM; - goto err; - } - local->op_ret = -1; - frame->local = local; - loc_copy (&local->loc, loc); - - - if (name && (strcmp (GF_XATTR_MARKER_KEY, name) == 0) - && (-1 == frame->root->pid)) { - local->marker.call_count = priv->child_count; - - sub_volumes = alloca ( priv->child_count * - sizeof (xlator_t *)); - for (i = 0, trav = this->children; trav ; - trav = trav->next, i++) { - - *(sub_volumes + i) = trav->xlator; - - } - - if (cluster_getmarkerattr (frame, this, loc, name, - local, stripe_getxattr_unwind, - sub_volumes, priv->child_count, - MARKER_UUID_TYPE, priv->vol_uuid)) { - op_errno = EINVAL; - goto err; - } - - return 0; - } - - if (*priv->vol_uuid) { - if ((match_uuid_local (name, priv->vol_uuid) == 0) - && (-1 == frame->root->pid)) { - local->marker.call_count = priv->child_count; - - sub_volumes = alloca ( priv->child_count * - sizeof (xlator_t *)); - for (i = 0, trav = this->children; trav ; - trav = trav->next, i++) { - - *(sub_volumes + i) = trav->xlator; - - } - - if (cluster_getmarkerattr (frame, this, loc, name, - local, stripe_getxattr_unwind, - sub_volumes, - 
priv->child_count, - MARKER_XTIME_TYPE, - priv->vol_uuid)) { - op_errno = EINVAL; - goto err; - } - return 0; - } - } - - - STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, loc, name); - - return 0; - -err: - STACK_UNWIND_STRICT (getxattr, frame, -1, op_errno, NULL); - return 0; -} - - - -struct xlator_fops fops = { - .stat = stripe_stat, - .unlink = stripe_unlink, - .rename = stripe_rename, - .link = stripe_link, - .truncate = stripe_truncate, - .create = stripe_create, - .open = stripe_open, - .readv = stripe_readv, - .writev = stripe_writev, - .statfs = stripe_statfs, - .flush = stripe_flush, - .fsync = stripe_fsync, - .ftruncate = stripe_ftruncate, - .fstat = stripe_fstat, - .mkdir = stripe_mkdir, - .rmdir = stripe_rmdir, - .lk = stripe_lk, - .opendir = stripe_opendir, - .fsyncdir = stripe_fsyncdir, - .setattr = stripe_setattr, - .fsetattr = stripe_fsetattr, - .lookup = stripe_lookup, - .mknod = stripe_mknod, - - .getxattr = stripe_getxattr, -}; - -struct xlator_cbks cbks = { - .release = stripe_release, -}; - - -struct volume_options options[] = { - { .key = {"block-size"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"use-xattr"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {NULL} }, -}; diff --git a/xlators/cluster/stripe/src/stripe.h b/xlators/cluster/stripe/src/stripe.h deleted file mode 100644 index 86555144f6d..00000000000 --- a/xlators/cluster/stripe/src/stripe.h +++ /dev/null @@ -1,188 +0,0 @@ -/* - Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - - -#ifndef _STRIPE_H_ -#define _STRIPE_H_ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "logging.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat.h" -#include "compat-errno.h" -#include "stripe-mem-types.h" -#include "libxlator.h" -#include <fnmatch.h> -#include <signal.h> - - -#define STRIPE_STACK_UNWIND(fop, frame, params ...) do { \ - stripe_local_t *__local = NULL; \ - if (frame) { \ - __local = frame->local; \ - frame->local = NULL; \ - } \ - STACK_UNWIND_STRICT (fop, frame, params); \ - if (__local) { \ - stripe_local_wipe(__local); \ - GF_FREE (__local); \ - } \ - } while (0) - -#define STRIPE_STACK_DESTROY(frame) do { \ - stripe_local_t *__local = NULL; \ - __local = frame->local; \ - frame->local = NULL; \ - STACK_DESTROY (frame->root); \ - if (__local) { \ - stripe_local_wipe (__local); \ - GF_FREE (__local); \ - } \ - } while (0) - -/** - * struct stripe_options : This keeps the pattern and the block-size - * information, which is used for striping on a file. 
- */ -struct stripe_options { - struct stripe_options *next; - char path_pattern[256]; - uint64_t block_size; -}; - -/** - * Private structure for stripe translator - */ -struct stripe_private { - struct stripe_options *pattern; - xlator_t **xl_array; - uint64_t block_size; - gf_lock_t lock; - uint8_t nodes_down; - int8_t first_child_down; - int8_t child_count; - int8_t *state; /* Current state of child node */ - gf_boolean_t xattr_supported; /* default yes */ - char vol_uuid[UUID_SIZE + 1]; -}; - -/** - * Used to keep info about the replies received from fops->readv calls - */ -struct readv_replies { - struct iovec *vector; - int32_t count; //count of vector - int32_t op_ret; //op_ret of readv - int32_t op_errno; - int32_t requested_size; - struct iatt stbuf; /* 'stbuf' is also a part of reply */ -}; - -typedef struct _stripe_fd_ctx { - off_t stripe_size; - int stripe_count; - int static_array; - xlator_t **xl_array; -} stripe_fd_ctx_t; - - -/** - * Local structure to be passed with all the frames in case of STACK_WIND - */ -struct stripe_local; /* this itself is used inside the structure; */ - -struct stripe_local { - struct stripe_local *next; - call_frame_t *orig_frame; - - stripe_fd_ctx_t *fctx; - - /* Used by _cbk functions */ - struct iatt stbuf; - struct iatt pre_buf; - struct iatt post_buf; - struct iatt preparent; - struct iatt postparent; - - off_t stbuf_size; - off_t prebuf_size; - off_t postbuf_size; - off_t preparent_size; - off_t postparent_size; - - blkcnt_t stbuf_blocks; - blkcnt_t prebuf_blocks; - blkcnt_t postbuf_blocks; - blkcnt_t preparent_blocks; - blkcnt_t postparent_blocks; - - struct readv_replies *replies; - struct statvfs statvfs_buf; - dir_entry_t *entry; - - int8_t revalidate; - int8_t failed; - int8_t unwind; - - size_t readv_size; - int32_t entry_count; - int32_t node_index; - int32_t call_count; - int32_t wind_count; /* used instead of child_cound - in case of read and write */ - int32_t op_ret; - int32_t op_errno; - int32_t count; - 
int32_t flags; - char *name; - inode_t *inode; - - loc_t loc; - loc_t loc2; - - /* For File I/O fops */ - dict_t *dict; - - struct marker_str marker; - - /* General usage */ - off_t offset; - off_t stripe_size; - - int xattr_self_heal_needed; - int entry_self_heal_needed; - - int8_t *list; - struct gf_flock lock; - fd_t *fd; - void *value; - struct iobref *iobref; -}; - -typedef struct stripe_local stripe_local_t; -typedef struct stripe_private stripe_private_t; - - -#endif /* _STRIPE_H_ */ diff --git a/xlators/cluster/unify/Makefile.am b/xlators/cluster/unify/Makefile.am deleted file mode 100644 index d471a3f9243..00000000000 --- a/xlators/cluster/unify/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -SUBDIRS = src - -CLEANFILES = diff --git a/xlators/cluster/unify/src/Makefile.am b/xlators/cluster/unify/src/Makefile.am deleted file mode 100644 index 2a1fe837263..00000000000 --- a/xlators/cluster/unify/src/Makefile.am +++ /dev/null @@ -1,16 +0,0 @@ - -xlator_LTLIBRARIES = unify.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/legacy/cluster - -unify_la_LDFLAGS = -module -avoidversion - -unify_la_SOURCES = unify.c unify-self-heal.c -unify_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -noinst_HEADERS = unify.h - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) - -CLEANFILES = - diff --git a/xlators/cluster/unify/src/unify-mem-types.h b/xlators/cluster/unify/src/unify-mem-types.h deleted file mode 100644 index dcf96477935..00000000000 --- a/xlators/cluster/unify/src/unify-mem-types.h +++ /dev/null @@ -1,41 +0,0 @@ - -/* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - - -#ifndef __UNIFY_MEM_TYPES_H__ -#define __UNIFY_MEM_TYPES_H__ - -#include "mem-types.h" - -enum gf_unify_mem_types_ { - gf_unify_mt_char = gf_common_mt_end + 1, - gf_unify_mt_int16_t, - gf_unify_mt_xlator_t, - gf_unify_mt_unify_private_t, - gf_unify_mt_xlator_list_t, - gf_unify_mt_dir_entry_t, - gf_unify_mt_off_t, - gf_unify_mt_int, - gf_unify_mt_unify_self_heal_struct, - gf_unify_mt_unify_local_t, - gf_unify_mt_end -}; -#endif - diff --git a/xlators/cluster/unify/src/unify-self-heal.c b/xlators/cluster/unify/src/unify-self-heal.c deleted file mode 100644 index 725523a2e42..00000000000 --- a/xlators/cluster/unify/src/unify-self-heal.c +++ /dev/null @@ -1,1239 +0,0 @@ -/* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. 
- - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -/** - * unify-self-heal.c : - * This file implements few functions which enables 'unify' translator - * to be consistent in its behaviour when - * > a node fails, - * > a node gets added, - * > a failed node comes back - * > a new namespace server is added (ie, an fresh namespace server). - * - * This functionality of 'unify' will enable glusterfs to support storage - * system failure, and maintain consistancy. This works both ways, ie, when - * an entry (either file or directory) is found on namespace server, and not - * on storage nodes, its created in storage nodes and vica-versa. - * - * The two fops, where it can be implemented are 'getdents ()' and 'lookup ()' - * - */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "unify.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "common-utils.h" - -int32_t -unify_sh_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count); - -int32_t -unify_sh_ns_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count); - -int32_t -unify_bgsh_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count); - -int32_t -unify_bgsh_ns_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count); - -/** - * unify_local_wipe - free all the extra allocation of local->* here. 
- */ -static void -unify_local_wipe (unify_local_t *local) -{ - /* Free the strdup'd variables in the local structure */ - if (local->name) { - GF_FREE (local->name); - } - - if (local->sh_struct) { - if (local->sh_struct->offset_list) - GF_FREE (local->sh_struct->offset_list); - - if (local->sh_struct->entry_list) - GF_FREE (local->sh_struct->entry_list); - - if (local->sh_struct->count_list) - GF_FREE (local->sh_struct->count_list); - - GF_FREE (local->sh_struct); - } - - loc_wipe (&local->loc1); - loc_wipe (&local->loc2); -} - -int32_t -unify_sh_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - inode_t *inode = NULL; - dict_t *tmp_dict = NULL; - dir_entry_t *prev, *entry, *trav; - - LOCK (&frame->lock); - { - /* if local->call_count == 0, that means, setdents on - * storagenodes is still pending. - */ - if (local->call_count) - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (callcnt == 0) { - if (local->sh_struct->entry_list[0]) { - prev = entry = local->sh_struct->entry_list[0]; - if (!entry) - return 0; - trav = entry->next; - while (trav) { - prev->next = trav->next; - GF_FREE (trav->name); - if (IA_ISLNK (trav->buf.ia_type)) - GF_FREE (trav->link); - GF_FREE (trav); - trav = prev->next; - } - GF_FREE (entry); - } - - if (!local->flags) { - if (local->sh_struct->count_list[0] >= - UNIFY_SELF_HEAL_GETDENTS_COUNT) { - /* count == size, that means, there are more entries - to read from */ - //local->call_count = 0; - local->sh_struct->offset_list[0] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND (frame, - unify_sh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[0], - GF_GET_DIR_ONLY); - } - } else { - inode = local->loc1.inode; - fd_unref (local->fd); - tmp_dict = local->dict; - - unify_local_wipe (local); - - STACK_UNWIND (frame, 
local->op_ret, local->op_errno, - inode, &local->stbuf, local->dict, - &local->oldpostparent); - if (tmp_dict) - dict_unref (tmp_dict); - } - } - - return 0; -} - - -int32_t -unify_sh_ns_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = 0; - unsigned long final = 0; - dir_entry_t *tmp = GF_CALLOC (1, sizeof (dir_entry_t), - gf_unify_mt_dir_entry_t); - - local->sh_struct->entry_list[0] = tmp; - local->sh_struct->count_list[0] = count; - if (entry) { - tmp->next = entry->next; - entry->next = NULL; - } - - if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) { - final = 1; - } - - LOCK (&frame->lock); - { - /* local->call_count will be '0' till now. make it 1 so, it - can be UNWIND'ed for the last call. */ - local->call_count = priv->child_count; - if (final) - local->flags = 1; - } - UNLOCK (&frame->lock); - - for (index = 0; index < priv->child_count; index++) - { - STACK_WIND_COOKIE (frame, - unify_sh_setdents_cbk, - (void *)index, - priv->xl_array[index], - priv->xl_array[index]->fops->setdents, - local->fd, GF_SET_DIR_ONLY, - local->sh_struct->entry_list[0], count); - } - - return 0; -} - -int32_t -unify_sh_ns_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = (long)cookie; - dir_entry_t *prev, *entry, *trav; - - LOCK (&frame->lock); - { - if (local->sh_struct->entry_list[index]) { - prev = entry = local->sh_struct->entry_list[index]; - trav = entry->next; - while (trav) { - prev->next = trav->next; - GF_FREE (trav->name); - if (IA_ISLNK (trav->buf.ia_type)) - GF_FREE (trav->link); - GF_FREE (trav); - trav = prev->next; - } - GF_FREE (entry); - } - } - UNLOCK (&frame->lock); - - if 
(local->sh_struct->count_list[index] < - UNIFY_SELF_HEAL_GETDENTS_COUNT) { - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - } else { - /* count == size, that means, there are more entries - to read from */ - local->sh_struct->offset_list[index] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND_COOKIE (frame, - unify_sh_getdents_cbk, - cookie, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[index], - GF_GET_ALL); - - gf_log (this->name, GF_LOG_DEBUG, - "readdir on (%s) with offset %"PRId64"", - priv->xl_array[index]->name, - local->sh_struct->offset_list[index]); - } - - if (!callcnt) { - /* All storage nodes have done unified setdents on NS node. - * Now, do getdents from NS and do setdents on storage nodes. - */ - - /* sh_struct->offset_list is no longer required for - storage nodes now */ - local->sh_struct->offset_list[0] = 0; /* reset */ - - STACK_WIND (frame, - unify_sh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_DIR_ONLY); - } - - return 0; -} - - -/** - * unify_sh_getdents_cbk - - */ -int32_t -unify_sh_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = (long)cookie; - dir_entry_t *tmp = NULL; - - if (op_ret >= 0 && count > 0) { - /* There is some dentry found, just send the dentry to NS */ - tmp = GF_CALLOC (1, sizeof (dir_entry_t), - gf_unify_mt_dir_entry_t); - local->sh_struct->entry_list[index] = tmp; - local->sh_struct->count_list[index] = count; - if (entry) { - tmp->next = entry->next; - entry->next = NULL; - } - STACK_WIND_COOKIE (frame, - unify_sh_ns_setdents_cbk, - cookie, - NS(this), - 
NS(this)->fops->setdents, - local->fd, - GF_SET_IF_NOT_PRESENT, - local->sh_struct->entry_list[index], - count); - return 0; - } - - if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) { - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - } else { - /* count == size, that means, there are more entries - to read from */ - local->sh_struct->offset_list[index] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND_COOKIE (frame, - unify_sh_getdents_cbk, - cookie, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[index], - GF_GET_ALL); - - gf_log (this->name, GF_LOG_DEBUG, - "readdir on (%s) with offset %"PRId64"", - priv->xl_array[index]->name, - local->sh_struct->offset_list[index]); - } - - if (!callcnt) { - /* All storage nodes have done unified setdents on NS node. - * Now, do getdents from NS and do setdents on storage nodes. - */ - - /* sh_struct->offset_list is no longer required for - storage nodes now */ - local->sh_struct->offset_list[0] = 0; /* reset */ - - STACK_WIND (frame, - unify_sh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_DIR_ONLY); - } - - return 0; -} - -/** - * unify_sh_opendir_cbk - - * - * @cookie: - */ -int32_t -unify_sh_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t index = 0; - inode_t *inode = NULL; - dict_t *tmp_dict = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret >= 0) { - local->op_ret = op_ret; - } else { - gf_log (this->name, GF_LOG_WARNING, "failed"); - local->failed = 1; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local->call_count = priv->child_count + 1; - - if 
(!local->failed) { - /* send getdents() namespace after finishing - storage nodes */ - local->call_count--; - - fd_bind (fd); - - if (local->call_count) { - /* Used as the offset index. This list keeps - * track of offset sent to each node during - * STACK_WIND. - */ - local->sh_struct->offset_list = - GF_CALLOC (priv->child_count, - sizeof (off_t), - gf_unify_mt_off_t); - ERR_ABORT (local->sh_struct->offset_list); - - local->sh_struct->entry_list = - GF_CALLOC (priv->child_count, - sizeof (dir_entry_t *), - gf_unify_mt_dir_entry_t); - ERR_ABORT (local->sh_struct->entry_list); - - local->sh_struct->count_list = - GF_CALLOC (priv->child_count, - sizeof (int), - gf_unify_mt_int); - ERR_ABORT (local->sh_struct->count_list); - - /* Send getdents on all the fds */ - for (index = 0; - index < priv->child_count; index++) { - STACK_WIND_COOKIE (frame, - unify_sh_getdents_cbk, - (void *)(long)index, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_ALL); - } - - /* did stack wind, so no need to unwind here */ - return 0; - } /* (local->call_count) */ - } /* (!local->failed) */ - - /* Opendir failed on one node. */ - inode = local->loc1.inode; - fd_unref (local->fd); - tmp_dict = local->dict; - - unify_local_wipe (local); - /* Only 'self-heal' failed, lookup() was successful. */ - local->op_ret = 0; - - /* This is lookup_cbk ()'s UNWIND. */ - STACK_UNWIND (frame, local->op_ret, local->op_errno, inode, - &local->stbuf, local->dict, &local->oldpostparent); - if (tmp_dict) - dict_unref (tmp_dict); - } - - return 0; -} - -/** - * gf_sh_checksum_cbk - - * - * @frame: frame used in lookup. get a copy of it, and use that copy. - * @this: pointer to unify xlator. - * @inode: pointer to inode, for which the consistency check is required. 
- * - */ -int32_t -unify_sh_checksum_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - uint8_t *file_checksum, - uint8_t *dir_checksum) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t index = 0; - int32_t callcnt = 0; - inode_t *inode = NULL; - dict_t *tmp_dict = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret >= 0) { - if (NS(this) == (xlator_t *)cookie) { - memcpy (local->sh_struct->ns_file_checksum, - file_checksum, NAME_MAX); - memcpy (local->sh_struct->ns_dir_checksum, - dir_checksum, NAME_MAX); - } else { - if (local->entry_count == 0) { - /* Initialize the dir_checksum to be - * used for comparision with other - * storage nodes. Should be done for - * the first successful call *only*. - */ - /* Using 'entry_count' as a flag */ - local->entry_count = 1; - memcpy (local->sh_struct->dir_checksum, - dir_checksum, NAME_MAX); - } - - /* Reply from the storage nodes */ - for (index = 0; - index < NAME_MAX; index++) { - /* Files should be present in - only one node */ - local->sh_struct->file_checksum[index] ^= file_checksum[index]; - - /* directory structure should be - same accross */ - if (local->sh_struct->dir_checksum[index] != dir_checksum[index]) - local->failed = 1; - } - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - for (index = 0; index < NAME_MAX ; index++) { - if (local->sh_struct->file_checksum[index] != - local->sh_struct->ns_file_checksum[index]) { - local->failed = 1; - break; - } - if (local->sh_struct->dir_checksum[index] != - local->sh_struct->ns_dir_checksum[index]) { - local->failed = 1; - break; - } - } - - if (local->failed) { - /* Log it, it should be a rare event */ - gf_log (this->name, GF_LOG_WARNING, - "Self-heal triggered on directory %s", - local->loc1.path); - - /* Any self heal will be done at directory level */ - local->call_count = 0; - local->op_ret = -1; - local->failed = 0; - - local->fd = 
fd_create (local->loc1.inode, - frame->root->pid); - - local->call_count = priv->child_count + 1; - - for (index = 0; - index < (priv->child_count + 1); index++) { - STACK_WIND_COOKIE (frame, - unify_sh_opendir_cbk, - priv->xl_array[index]->name, - priv->xl_array[index], - priv->xl_array[index]->fops->opendir, - &local->loc1, - local->fd); - } - /* opendir can be done on the directory */ - return 0; - } - - /* no mismatch */ - inode = local->loc1.inode; - tmp_dict = local->dict; - - unify_local_wipe (local); - - /* This is lookup_cbk ()'s UNWIND. */ - STACK_UNWIND (frame, - local->op_ret, - local->op_errno, - inode, - &local->stbuf, - local->dict, &local->oldpostparent); - if (tmp_dict) - dict_unref (tmp_dict); - } - - return 0; -} - -/* Foreground self-heal part over */ - -/* Background self-heal part */ - -int32_t -unify_bgsh_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - dir_entry_t *prev, *entry, *trav; - - LOCK (&frame->lock); - { - /* if local->call_count == 0, that means, setdents - on storagenodes is still pending. 
*/ - if (local->call_count) - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - - if (callcnt == 0) { - if (local->sh_struct->entry_list[0]) { - prev = entry = local->sh_struct->entry_list[0]; - trav = entry->next; - while (trav) { - prev->next = trav->next; - GF_FREE (trav->name); - if (IA_ISLNK (trav->buf.ia_type)) - GF_FREE (trav->link); - GF_FREE (trav); - trav = prev->next; - } - GF_FREE (entry); - } - - if (!local->flags) { - if (local->sh_struct->count_list[0] >= - UNIFY_SELF_HEAL_GETDENTS_COUNT) { - /* count == size, that means, there are more - entries to read from */ - //local->call_count = 0; - local->sh_struct->offset_list[0] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND (frame, - unify_bgsh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[0], - GF_GET_DIR_ONLY); - } - } else { - fd_unref (local->fd); - unify_local_wipe (local); - STACK_DESTROY (frame->root); - } - } - - return 0; -} - - -int32_t -unify_bgsh_ns_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = 0; - unsigned long final = 0; - dir_entry_t *tmp = GF_CALLOC (1, sizeof (dir_entry_t), - gf_unify_mt_dir_entry_t); - - local->sh_struct->entry_list[0] = tmp; - local->sh_struct->count_list[0] = count; - if (entry) { - tmp->next = entry->next; - entry->next = NULL; - } - - if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) { - final = 1; - } - - LOCK (&frame->lock); - { - /* local->call_count will be '0' till now. make it 1 so, - it can be UNWIND'ed for the last call. 
*/ - local->call_count = priv->child_count; - if (final) - local->flags = 1; - } - UNLOCK (&frame->lock); - - for (index = 0; index < priv->child_count; index++) - { - STACK_WIND_COOKIE (frame, - unify_bgsh_setdents_cbk, - (void *)index, - priv->xl_array[index], - priv->xl_array[index]->fops->setdents, - local->fd, GF_SET_DIR_ONLY, - local->sh_struct->entry_list[0], count); - } - - return 0; -} - -int32_t -unify_bgsh_ns_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = (long)cookie; - dir_entry_t *prev, *entry, *trav; - - if (local->sh_struct->entry_list[index]) { - prev = entry = local->sh_struct->entry_list[index]; - if (!entry) - return 0; - trav = entry->next; - while (trav) { - prev->next = trav->next; - GF_FREE (trav->name); - if (IA_ISLNK (trav->buf.ia_type)) - GF_FREE (trav->link); - GF_FREE (trav); - trav = prev->next; - } - GF_FREE (entry); - } - - if (local->sh_struct->count_list[index] < - UNIFY_SELF_HEAL_GETDENTS_COUNT) { - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - } else { - /* count == size, that means, there are more entries - to read from */ - local->sh_struct->offset_list[index] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND_COOKIE (frame, - unify_bgsh_getdents_cbk, - cookie, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[index], - GF_GET_ALL); - - gf_log (this->name, GF_LOG_DEBUG, - "readdir on (%s) with offset %"PRId64"", - priv->xl_array[index]->name, - local->sh_struct->offset_list[index]); - } - - if (!callcnt) { - /* All storage nodes have done unified setdents on NS node. - * Now, do getdents from NS and do setdents on storage nodes. 
- */ - - /* sh_struct->offset_list is no longer required for - storage nodes now */ - local->sh_struct->offset_list[0] = 0; /* reset */ - - STACK_WIND (frame, - unify_bgsh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_DIR_ONLY); - } - - return 0; -} - - -/** - * unify_bgsh_getdents_cbk - - */ -int32_t -unify_bgsh_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = (long)cookie; - dir_entry_t *tmp = NULL; - - if (op_ret >= 0 && count > 0) { - /* There is some dentry found, just send the dentry to NS */ - tmp = GF_CALLOC (1, sizeof (dir_entry_t), - gf_unify_mt_dir_entry_t); - local->sh_struct->entry_list[index] = tmp; - local->sh_struct->count_list[index] = count; - if (entry) { - tmp->next = entry->next; - entry->next = NULL; - } - STACK_WIND_COOKIE (frame, - unify_bgsh_ns_setdents_cbk, - cookie, - NS(this), - NS(this)->fops->setdents, - local->fd, - GF_SET_IF_NOT_PRESENT, - local->sh_struct->entry_list[index], - count); - return 0; - } - - if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) { - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - } else { - /* count == size, that means, there are more entries to read from */ - local->sh_struct->offset_list[index] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - - STACK_WIND_COOKIE (frame, - unify_bgsh_getdents_cbk, - cookie, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[index], - GF_GET_ALL); - - gf_log (this->name, GF_LOG_DEBUG, - "readdir on (%s) with offset %"PRId64"", - priv->xl_array[index]->name, - local->sh_struct->offset_list[index]); - } - - if (!callcnt) { - /* All 
storage nodes have done unified setdents on NS node. - * Now, do getdents from NS and do setdents on storage nodes. - */ - - /* sh_struct->offset_list is no longer required for - storage nodes now */ - local->sh_struct->offset_list[0] = 0; /* reset */ - - STACK_WIND (frame, - unify_bgsh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_DIR_ONLY); - } - - return 0; -} - -/** - * unify_bgsh_opendir_cbk - - * - * @cookie: - */ -int32_t -unify_bgsh_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int32_t callcnt = 0; - int16_t index = 0; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret >= 0) { - local->op_ret = op_ret; - } else { - local->failed = 1; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local->call_count = priv->child_count + 1; - - if (!local->failed) { - /* send getdents() namespace after finishing - storage nodes */ - local->call_count--; - callcnt = local->call_count; - - fd_bind (fd); - - if (local->call_count) { - /* Used as the offset index. This list keeps - track of offset sent to each node during - STACK_WIND. 
*/ - local->sh_struct->offset_list = - GF_CALLOC (priv->child_count, - sizeof (off_t), - gf_unify_mt_off_t); - ERR_ABORT (local->sh_struct->offset_list); - - local->sh_struct->entry_list = - GF_CALLOC (priv->child_count, - sizeof (dir_entry_t *), - gf_unify_mt_dir_entry_t); - ERR_ABORT (local->sh_struct->entry_list); - - local->sh_struct->count_list = - GF_CALLOC (priv->child_count, - sizeof (int), - gf_unify_mt_int); - ERR_ABORT (local->sh_struct->count_list); - - /* Send getdents on all the fds */ - for (index = 0; - index < priv->child_count; index++) { - STACK_WIND_COOKIE (frame, - unify_bgsh_getdents_cbk, - (void *)(long)index, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_ALL); - } - /* did a stack wind, so no need to unwind here */ - return 0; - } /* (local->call_count) */ - } /* (!local->failed) */ - - /* Opendir failed on one node. */ - fd_unref (local->fd); - - unify_local_wipe (local); - STACK_DESTROY (frame->root); - } - - return 0; -} - -/** - * gf_bgsh_checksum_cbk - - * - * @frame: frame used in lookup. get a copy of it, and use that copy. - * @this: pointer to unify xlator. - * @inode: pointer to inode, for which the consistency check is required. 
- * - */ -int32_t -unify_bgsh_checksum_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - uint8_t *file_checksum, - uint8_t *dir_checksum) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t index = 0; - int32_t callcnt = 0; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret >= 0) { - if (NS(this) == (xlator_t *)cookie) { - memcpy (local->sh_struct->ns_file_checksum, - file_checksum, NAME_MAX); - memcpy (local->sh_struct->ns_dir_checksum, - dir_checksum, NAME_MAX); - } else { - if (local->entry_count == 0) { - /* Initialize the dir_checksum to be - * used for comparision with other - * storage nodes. Should be done for - * the first successful call *only*. - */ - /* Using 'entry_count' as a flag */ - local->entry_count = 1; - memcpy (local->sh_struct->dir_checksum, - dir_checksum, NAME_MAX); - } - - /* Reply from the storage nodes */ - for (index = 0; - index < NAME_MAX; index++) { - /* Files should be present in only - one node */ - local->sh_struct->file_checksum[index] ^= file_checksum[index]; - - /* directory structure should be same - accross */ - if (local->sh_struct->dir_checksum[index] != dir_checksum[index]) - local->failed = 1; - } - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - for (index = 0; index < NAME_MAX ; index++) { - if (local->sh_struct->file_checksum[index] != - local->sh_struct->ns_file_checksum[index]) { - local->failed = 1; - break; - } - if (local->sh_struct->dir_checksum[index] != - local->sh_struct->ns_dir_checksum[index]) { - local->failed = 1; - break; - } - } - - if (local->failed) { - /* Log it, it should be a rare event */ - gf_log (this->name, GF_LOG_WARNING, - "Self-heal triggered on directory %s", - local->loc1.path); - - /* Any self heal will be done at the directory level */ - local->op_ret = -1; - local->failed = 0; - - local->fd = fd_create (local->loc1.inode, - frame->root->pid); - local->call_count = 
priv->child_count + 1; - - for (index = 0; - index < (priv->child_count + 1); index++) { - STACK_WIND_COOKIE (frame, - unify_bgsh_opendir_cbk, - priv->xl_array[index]->name, - priv->xl_array[index], - priv->xl_array[index]->fops->opendir, - &local->loc1, - local->fd); - } - - /* opendir can be done on the directory */ - return 0; - } - - /* no mismatch */ - unify_local_wipe (local); - STACK_DESTROY (frame->root); - } - - return 0; -} - -/* Background self-heal part over */ - - - - -/** - * zr_unify_self_heal - - * - * @frame: frame used in lookup. get a copy of it, and use that copy. - * @this: pointer to unify xlator. - * @inode: pointer to inode, for which the consistency check is required. - * - */ -int32_t -zr_unify_self_heal (call_frame_t *frame, - xlator_t *this, - unify_local_t *local) -{ - unify_private_t *priv = this->private; - call_frame_t *bg_frame = NULL; - unify_local_t *bg_local = NULL; - inode_t *tmp_inode = NULL; - dict_t *tmp_dict = NULL; - int16_t index = 0; - - if (local->inode_generation < priv->inode_generation) { - /* Any self heal will be done at the directory level */ - /* Update the inode's generation to the current generation - value. 
*/ - local->inode_generation = priv->inode_generation; - inode_ctx_put (local->loc1.inode, this, - (uint64_t)(long)local->inode_generation); - - if (priv->self_heal == ZR_UNIFY_FG_SELF_HEAL) { - local->op_ret = 0; - local->failed = 0; - local->call_count = priv->child_count + 1; - local->sh_struct = - GF_CALLOC (1, sizeof (struct unify_self_heal_struct), - gf_unify_mt_unify_self_heal_struct); - - /* +1 is for NS */ - for (index = 0; - index < (priv->child_count + 1); index++) { - STACK_WIND_COOKIE (frame, - unify_sh_checksum_cbk, - priv->xl_array[index], - priv->xl_array[index], - priv->xl_array[index]->fops->checksum, - &local->loc1, - 0); - } - - /* Self-heal in foreground, hence no need - to UNWIND here */ - return 0; - } - - /* Self Heal done in background */ - bg_frame = copy_frame (frame); - INIT_LOCAL (bg_frame, bg_local); - loc_copy (&bg_local->loc1, &local->loc1); - bg_local->op_ret = 0; - bg_local->failed = 0; - bg_local->call_count = priv->child_count + 1; - bg_local->sh_struct = - GF_CALLOC (1, sizeof (struct unify_self_heal_struct), - gf_unify_mt_unify_self_heal_struct); - - /* +1 is for NS */ - for (index = 0; index < (priv->child_count + 1); index++) { - STACK_WIND_COOKIE (bg_frame, - unify_bgsh_checksum_cbk, - priv->xl_array[index], - priv->xl_array[index], - priv->xl_array[index]->fops->checksum, - &bg_local->loc1, - 0); - } - } - - /* generation number matches, self heal already done or - * self heal done in background: just do STACK_UNWIND - */ - tmp_inode = local->loc1.inode; - tmp_dict = local->dict; - - unify_local_wipe (local); - - /* This is lookup_cbk ()'s UNWIND. 
*/ - STACK_UNWIND (frame, - local->op_ret, - local->op_errno, - tmp_inode, - &local->stbuf, - local->dict, - &local->oldpostparent); - - if (tmp_dict) - dict_unref (tmp_dict); - - return 0; -} - diff --git a/xlators/cluster/unify/src/unify.c b/xlators/cluster/unify/src/unify.c deleted file mode 100644 index 422c8d6d8e2..00000000000 --- a/xlators/cluster/unify/src/unify.c +++ /dev/null @@ -1,4589 +0,0 @@ -/* - Copyright (c) 2006-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -/** - * xlators/cluster/unify: - * - This xlator is one of the main translator in GlusterFS, which - * actually does the clustering work of the file system. One need to - * understand that, unify assumes file to be existing in only one of - * the child node, and directories to be present on all the nodes. - * - * NOTE: - * Now, unify has support for global namespace, which is used to keep a - * global view of fs's namespace tree. The stat for directories are taken - * just from the namespace, where as for files, just 'ia_ino' is taken from - * Namespace node, and other stat info is taken from the actual storage node. - * Also Namespace node helps to keep consistant inode for files across - * glusterfs (re-)mounts. 
- */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "unify.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "defaults.h" -#include "common-utils.h" -#include <signal.h> -#include <libgen.h> -#include "compat-errno.h" -#include "compat.h" - -#define UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR(_loc) do { \ - if (!(_loc && _loc->inode)) { \ - STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL); \ - return 0; \ - } \ -} while(0) - - -#define UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(_fd) do { \ - if (!(_fd && !fd_ctx_get (_fd, this, NULL))) { \ - STACK_UNWIND (frame, -1, EBADFD, NULL, NULL); \ - return 0; \ - } \ -} while(0) - -#define UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(_fd) do { \ - if (!_fd) { \ - STACK_UNWIND (frame, -1, EBADFD, NULL, NULL); \ - return 0; \ - } \ -} while(0) - -/** - * unify_local_wipe - free all the extra allocation of local->* here. - */ -static void -unify_local_wipe (unify_local_t *local) -{ - /* Free the strdup'd variables in the local structure */ - if (local->name) { - GF_FREE (local->name); - } - loc_wipe (&local->loc1); - loc_wipe (&local->loc2); -} - - - -/* - * unify_normalize_stats - - */ -void -unify_normalize_stats (struct statvfs *buf, - unsigned long bsize, - unsigned long frsize) -{ - double factor; - - if (buf->f_bsize != bsize) { - factor = ((double) buf->f_bsize) / bsize; - buf->f_bsize = bsize; - buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree); - buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail); - } - - if (buf->f_frsize != frsize) { - factor = ((double) buf->f_frsize) / frsize; - buf->f_frsize = frsize; - buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks); - } -} - - -xlator_t * -unify_loc_subvol (loc_t *loc, xlator_t *this) -{ - unify_private_t *priv = NULL; - xlator_t *subvol = NULL; - int16_t *list = NULL; - long index = 0; - xlator_t *subvol_i = NULL; - int ret = 0; - uint64_t tmp_list = 0; - - priv = 
this->private; - subvol = NS (this); - - if (!IA_ISDIR (loc->inode->ia_type)) { - ret = inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - if (!list) - goto out; - - for (index = 0; list[index] != -1; index++) { - subvol_i = priv->xl_array[list[index]]; - if (subvol_i != NS (this)) { - subvol = subvol_i; - break; - } - } - } -out: - return subvol; -} - - - -/** - * unify_statfs_cbk - - */ -int32_t -unify_statfs_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct statvfs *stbuf) -{ - int32_t callcnt = 0; - struct statvfs *dict_buf = NULL; - unsigned long bsize; - unsigned long frsize; - unify_local_t *local = (unify_local_t *)frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - /* when a call is successfull, add it to local->dict */ - dict_buf = &local->statvfs_buf; - - if (dict_buf->f_bsize != 0) { - bsize = max (dict_buf->f_bsize, - stbuf->f_bsize); - - frsize = max (dict_buf->f_frsize, - stbuf->f_frsize); - unify_normalize_stats(dict_buf, bsize, frsize); - unify_normalize_stats(stbuf, bsize, frsize); - } else { - dict_buf->f_bsize = stbuf->f_bsize; - dict_buf->f_frsize = stbuf->f_frsize; - } - - dict_buf->f_blocks += stbuf->f_blocks; - dict_buf->f_bfree += stbuf->f_bfree; - dict_buf->f_bavail += stbuf->f_bavail; - dict_buf->f_files += stbuf->f_files; - dict_buf->f_ffree += stbuf->f_ffree; - dict_buf->f_favail += stbuf->f_favail; - dict_buf->f_fsid = stbuf->f_fsid; - dict_buf->f_flag = stbuf->f_flag; - dict_buf->f_namemax = stbuf->f_namemax; - local->op_ret = op_ret; - } else { - /* fop on storage node has failed due to some error */ - if (op_errno != ENOTCONN) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): %s", - prev_frame->this->name, - strerror (op_errno)); - } - local->op_errno = op_errno; - } - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - STACK_UNWIND (frame, local->op_ret, 
local->op_errno, - &local->statvfs_buf); - } - - return 0; -} - -/** - * unify_statfs - - */ -int32_t -unify_statfs (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - unify_local_t *local = NULL; - xlator_list_t *trav = this->children; - - INIT_LOCAL (frame, local); - local->call_count = ((unify_private_t *)this->private)->child_count; - - while(trav) { - STACK_WIND (frame, - unify_statfs_cbk, - trav->xlator, - trav->xlator->fops->statfs, - loc); - trav = trav->next; - } - - return 0; -} - -/** - * unify_buf_cbk - - */ -int32_t -unify_buf_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s(): child(%s): path(%s): %s", - gf_fop_list[frame->root->op], - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - - local->op_errno = op_errno; - if ((op_errno == ENOENT) && priv->optimist) - local->op_ret = 0; - } - - if (op_ret >= 0) { - local->op_ret = 0; - - if (NS (this) == prev_frame->this) { - local->ia_ino = buf->ia_ino; - /* If the entry is directory, get the stat - from NS node */ - if (IA_ISDIR (buf->ia_type) || - !local->stbuf.ia_blksize) { - local->stbuf = *buf; - } - } - - if ((!IA_ISDIR (buf->ia_type)) && - (NS (this) != prev_frame->this)) { - /* If file, take the stat info from Storage - node. 
*/ - local->stbuf = *buf; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - /* If the inode number is not filled, operation should - fail */ - if (!local->ia_ino) - local->op_ret = -1; - - local->stbuf.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf); - } - - return 0; -} - -#define check_if_dht_linkfile(s) \ - ((st_mode_from_ia (s->ia_prot, s->ia_type) & ~S_IFMT) == S_ISVTX) - -/** - * unify_lookup_cbk - - */ -int32_t -unify_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - inode_t *tmp_inode = NULL; - dict_t *local_dict = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - if (local->revalidate && - (op_errno == ESTALE)) { - /* ESTALE takes priority */ - local->op_errno = op_errno; - local->failed = 1; - } - - if ((op_errno != ENOTCONN) - && (op_errno != ENOENT) - && (local->op_errno != ESTALE)) { - /* if local->op_errno is already ESTALE, then - * ESTALE has to propogated to the parent first. - * do not enter here. - */ - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - local->failed = 1; - - } else if (local->revalidate && - (local->op_errno != ESTALE) && - !(priv->optimist && (op_errno == ENOENT))) { - - gf_log (this->name, - (op_errno == ENOTCONN) ? 
- GF_LOG_DEBUG:GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - local->failed = 1; - } - } - - if (op_ret == 0) { - local->op_ret = 0; - - if (check_if_dht_linkfile(buf)) { - gf_log (this->name, GF_LOG_CRITICAL, - "file %s may be DHT link file on %s, " - "make sure the backend is not shared " - "between unify and DHT", - local->loc1.path, - priv->xl_array[(long)cookie]->name); - } - - if (local->stbuf.ia_type && local->stbuf.ia_blksize) { - /* make sure we already have a stbuf - stored in local->stbuf */ - if (IA_ISDIR (local->stbuf.ia_type) && - !IA_ISDIR (buf->ia_type)) { - gf_log (this->name, GF_LOG_CRITICAL, - "[CRITICAL] '%s' is directory " - "on namespace, non-directory " - "on node '%s', returning EIO", - local->loc1.path, - priv->xl_array[(long)cookie]->name); - local->return_eio = 1; - } - if (!IA_ISDIR (local->stbuf.ia_type) && - IA_ISDIR (buf->ia_type)) { - gf_log (this->name, GF_LOG_CRITICAL, - "[CRITICAL] '%s' is directory " - "on node '%s', non-directory " - "on namespace, returning EIO", - local->loc1.path, - priv->xl_array[(long)cookie]->name); - local->return_eio = 1; - } - } - - if (!local->revalidate && !IA_ISDIR (buf->ia_type)) { - /* This is the first time lookup on file*/ - if (!local->list) { - /* list is not allocated, allocate - the max possible range */ - local->list = GF_CALLOC (1, 2 * (priv->child_count + 2), - gf_unify_mt_int16_t); - if (!local->list) { - gf_log (this->name, - GF_LOG_CRITICAL, - "Not enough memory"); - STACK_UNWIND (frame, -1, - ENOMEM, inode, - NULL, NULL, NULL); - return 0; - } - } - /* update the index of the list */ - local->list [local->index++] = - (int16_t)(long)cookie; - } - - if (!local->revalidate && IA_ISDIR (buf->ia_type)) { - /* fresh lookup of a directory */ - inode_ctx_put (local->loc1.inode, this, - priv->inode_generation); - } - - if ((!local->dict) && dict && - (priv->xl_array[(long)cookie] != 
NS(this))) { - local->dict = dict_ref (dict); - } - - /* index of NS node is == total child count */ - if (priv->child_count == (int16_t)(long)cookie) { - /* Take the inode number from namespace */ - local->ia_ino = buf->ia_ino; - if (IA_ISDIR (buf->ia_type) || - !(local->stbuf.ia_blksize)) { - local->stbuf = *buf; - local->oldpostparent = *postparent; - } - } else if (!IA_ISDIR (buf->ia_type)) { - /* If file, then get the stat from - storage node */ - local->stbuf = *buf; - } - - if (local->ia_nlink < buf->ia_nlink) { - local->ia_nlink = buf->ia_nlink; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local_dict = local->dict; - if (local->return_eio) { - gf_log (this->name, GF_LOG_CRITICAL, - "[CRITICAL] Unable to fix the path (%s) with " - "self-heal, try manual verification. " - "returning EIO.", local->loc1.path); - unify_local_wipe (local); - STACK_UNWIND (frame, -1, EIO, inode, NULL, NULL); - if (local_dict) { - dict_unref (local_dict); - } - return 0; - } - - if (!local->stbuf.ia_blksize) { - /* Inode not present */ - local->op_ret = -1; - } else { - if (!local->revalidate && - !IA_ISDIR (local->stbuf.ia_type)) { - /* If its a file, big array is useless, - allocate the smaller one */ - int16_t *list = NULL; - list = GF_CALLOC (1, 2 * (local->index + 1), - gf_unify_mt_int16_t); - ERR_ABORT (list); - memcpy (list, local->list, 2 * local->index); - /* Make the end of the list as -1 */ - GF_FREE (local->list); - local->list = list; - local->list [local->index] = -1; - /* Update the inode's ctx with proper array */ - /* TODO: log on failure */ - inode_ctx_put (local->loc1.inode, this, - (uint64_t)(long)local->list); - } - - if (IA_ISDIR(local->loc1.inode->ia_type)) { - /* lookup is done for directory */ - if (local->failed && priv->self_heal) { - /* Triggering self-heal */ - /* means, self-heal required for this - inode */ - local->inode_generation = 0; - priv->inode_generation++; - } - } else { - local->stbuf.ia_ino = local->ia_ino; - } - - 
local->stbuf.ia_nlink = local->ia_nlink; - } - if (local->op_ret == -1) { - if (!local->revalidate && local->list) - GF_FREE (local->list); - } - - if ((local->op_ret >= 0) && local->failed && - local->revalidate) { - /* Done revalidate, but it failed */ - if ((op_errno != ENOTCONN) - && (local->op_errno != ESTALE)) { - gf_log (this->name, GF_LOG_ERROR, - "Revalidate failed for path(%s): %s", - local->loc1.path, strerror (op_errno)); - } - local->op_ret = -1; - } - - if ((priv->self_heal && !priv->optimist) && - (!local->revalidate && (local->op_ret == 0) && - IA_ISDIR(local->stbuf.ia_type))) { - /* Let the self heal be done here */ - zr_unify_self_heal (frame, this, local); - local_dict = NULL; - } else { - if (local->failed) { - /* NOTE: directory lookup is sent to all - * subvolumes and success from a subvolume - * might set local->op_ret to 0 (zero) */ - local->op_ret = -1; - } - - /* either no self heal, or op_ret == -1 (failure) */ - tmp_inode = local->loc1.inode; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - tmp_inode, &local->stbuf, local->dict, - &local->oldpostparent); - } - if (local_dict) { - dict_unref (local_dict); - } - } - - return 0; -} - -/** - * unify_lookup - - */ -int32_t -unify_lookup (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *xattr_req) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int16_t *list = NULL; - long index = 0; - - if (!(loc && loc->inode)) { - gf_log (this->name, GF_LOG_ERROR, - "%s: Argument not right", loc?loc->path:"(null)"); - STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL, NULL); - return 0; - } - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL, NULL, NULL); - return 0; - } - - if (inode_ctx_get (loc->inode, this, NULL) - && IA_ISDIR 
(loc->inode->ia_type)) { - local->revalidate = 1; - } - - if (!inode_ctx_get (loc->inode, this, NULL) && - loc->inode->ia_type && - !IA_ISDIR (loc->inode->ia_type)) { - uint64_t tmp_list = 0; - /* check if revalidate or fresh lookup */ - inode_ctx_get (loc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - } - - if (local->list) { - list = local->list; - for (index = 0; list[index] != -1; index++); - if (index != 2) { - if (index < 2) { - gf_log (this->name, GF_LOG_ERROR, - "returning ESTALE for %s: file " - "count is %ld", loc->path, index); - /* Print where all the file is present */ - for (index = 0; - local->list[index] != -1; index++) { - gf_log (this->name, GF_LOG_ERROR, - "%s: found on %s", loc->path, - priv->xl_array[list[index]]->name); - } - unify_local_wipe (local); - STACK_UNWIND (frame, -1, ESTALE, - NULL, NULL, NULL, NULL); - return 0; - } else { - /* There are more than 2 presences */ - /* Just log and continue */ - gf_log (this->name, GF_LOG_ERROR, - "%s: file count is %ld", - loc->path, index); - /* Print where all the file is present */ - for (index = 0; - local->list[index] != -1; index++) { - gf_log (this->name, GF_LOG_ERROR, - "%s: found on %s", loc->path, - priv->xl_array[list[index]]->name); - } - } - } - - /* is revalidate */ - local->revalidate = 1; - - for (index = 0; list[index] != -1; index++) - local->call_count++; - - for (index = 0; list[index] != -1; index++) { - char need_break = (list[index+1] == -1); - STACK_WIND_COOKIE (frame, - unify_lookup_cbk, - (void *)(long)list[index], //cookie - priv->xl_array [list[index]], - priv->xl_array [list[index]]->fops->lookup, - loc, - xattr_req); - if (need_break) - break; - } - } else { - if (loc->inode->ia_type) { - if (inode_ctx_get (loc->inode, this, NULL)) { - inode_ctx_get (loc->inode, this, - &local->inode_generation); - } - } - /* This is first call, there is no list */ - /* call count should be all child + 1 namespace */ - local->call_count = priv->child_count + 1; - 
- for (index = 0; index <= priv->child_count; index++) { - STACK_WIND_COOKIE (frame, - unify_lookup_cbk, - (void *)index, //cookie - priv->xl_array[index], - priv->xl_array[index]->fops->lookup, - loc, - xattr_req); - } - } - - return 0; -} - -/** - * unify_stat - if directory, get the stat directly from NameSpace child. - * if file, check for a hint and send it only there (also to NS). - * if its a fresh stat, then do it on all the nodes. - * - * NOTE: for all the call, sending cookie as xlator pointer, which will be - * used in cbk. - */ -int32_t -unify_stat (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int16_t index = 0; - int16_t *list = NULL; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; - } - local->ia_ino = loc->inode->ino; - if (IA_ISDIR (loc->inode->ia_type)) { - /* Directory */ - local->call_count = 1; - STACK_WIND (frame, unify_buf_cbk, NS(this), - NS(this)->fops->stat, loc); - } else { - /* File */ - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) - local->call_count++; - - for (index = 0; list[index] != -1; index++) { - char need_break = (list[index+1] == -1); - STACK_WIND (frame, - unify_buf_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->stat, - loc); - if (need_break) - break; - } - } - - return 0; -} - -/** - * unify_access_cbk - - */ -int32_t -unify_access_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - -/** - * unify_access - Send request to only namespace, which has all the - * 
attributes set for the file. - */ -int32_t -unify_access (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t mask) -{ - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - STACK_WIND (frame, - unify_access_cbk, - NS(this), - NS(this)->fops->access, - loc, - mask); - - return 0; -} - -int32_t -unify_mkdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - inode_t *tmp_inode = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if ((op_ret == -1) && !(priv->optimist && - (op_errno == ENOENT || - op_errno == EEXIST))) { - /* TODO: Decrement the inode_generation of - * this->inode's parent inode, hence the missing - * directory is created properly by self-heal. - * Currently, there is no way to get the parent - * inode directly. 
- */ - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - if (op_errno != EEXIST) - local->failed = 1; - local->op_errno = op_errno; - } - - if (op_ret >= 0) - local->op_ret = 0; - - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (!local->failed) { - inode_ctx_put (local->loc1.inode, this, - priv->inode_generation); - } - - tmp_inode = local->loc1.inode; - unify_local_wipe (local); - - STACK_UNWIND (frame, local->op_ret, local->op_errno, - tmp_inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - } - - return 0; -} - -/** - * unify_ns_mkdir_cbk - - */ -int32_t -unify_ns_mkdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - long index = 0; - - if (op_ret == -1) { - /* No need to send mkdir request to other servers, - * as namespace action failed - */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s): %s", - local->name, strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, NULL, - NULL, NULL); - return 0; - } - - /* Create one inode for this entry */ - local->op_ret = 0; - local->stbuf = *buf; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - local->call_count = priv->child_count; - - /* Send mkdir request to all the nodes now */ - for (index = 0; index < priv->child_count; index++) { - STACK_WIND_COOKIE (frame, - unify_mkdir_cbk, - (void *)index, //cookie - priv->xl_array[index], - priv->xl_array[index]->fops->mkdir, - &local->loc1, - local->mode); - } - - return 0; -} - - -/** - * unify_mkdir - - */ -int32_t -unify_mkdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode) -{ - unify_local_t *local = NULL; - - /* Initialization */ 
- INIT_LOCAL (frame, local); - local->mode = mode; - - loc_copy (&local->loc1, loc); - - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_mkdir_cbk, - NS(this), - NS(this)->fops->mkdir, - loc, - mode); - return 0; -} - -/** - * unify_rmdir_cbk - - */ -int32_t -unify_rmdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == 0 || (priv->optimist && (op_errno == ENOENT))) - local->op_ret = 0; - if (op_ret == -1) - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->oldpreparent, &local->oldpostparent); - } - - return 0; -} - -/** - * unify_ns_rmdir_cbk - - */ -int32_t -unify_ns_rmdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - int16_t index = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - if (op_ret == -1) { - /* No need to send rmdir request to other servers, - * as namespace action failed - */ - gf_log (this->name, - ((op_errno != ENOTEMPTY) ? 
- GF_LOG_ERROR : GF_LOG_DEBUG), - "namespace: path(%s): %s", - local->loc1.path, strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL); - return 0; - } - - local->call_count = priv->child_count; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - for (index = 0; index < priv->child_count; index++) { - STACK_WIND (frame, - unify_rmdir_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->rmdir, - &local->loc1); - } - - return 0; -} - -/** - * unify_rmdir - - */ -int32_t -unify_rmdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - unify_local_t *local = NULL; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_rmdir_cbk, - NS(this), - NS(this)->fops->rmdir, - loc); - - return 0; -} - -/** - * unify_open_cbk - - */ -int32_t -unify_open_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - local->op_ret = op_ret; - if (NS(this) != (xlator_t *)cookie) { - /* Store child node's ptr, used in - all the f*** / FileIO calls */ - fd_ctx_set (fd, this, (uint64_t)(long)cookie); - } - } - if (op_ret == -1) { - local->op_errno = op_errno; - local->failed = 1; - } - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if ((local->failed == 1) && (local->op_ret >= 0)) { - local->call_count = 1; - /* return -1 to user */ - local->op_ret = -1; - //local->op_errno = EIO; - - if (!fd_ctx_get (local->fd, this, NULL)) { - gf_log (this->name, GF_LOG_ERROR, - "Open success on child node, " - "failed on namespace"); - } 
else { - gf_log (this->name, GF_LOG_ERROR, - "Open success on namespace, " - "failed on child node"); - } - } - - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, - local->op_errno, local->fd); - } - - return 0; -} - -#ifdef GF_DARWIN_HOST_OS -/** - * unify_create_lookup_cbk - - */ -int32_t -unify_open_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) -{ - int32_t callcnt = 0; - int16_t index = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if ((op_ret == -1) && (op_errno != ENOENT)) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - local->index++; - if (NS(this) == priv->xl_array[(long)cookie]) { - local->list[0] = (int16_t)(long)cookie; - } else { - local->list[1] = (int16_t)(long)cookie; - } - if (IA_ISDIR (buf->ia_type)) - local->failed = 1; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - int16_t file_list[3] = {0,}; - local->op_ret = -1; - - file_list[0] = local->list[0]; - file_list[1] = local->list[1]; - file_list[2] = -1; - - if (local->index != 2) { - /* Lookup failed, can't do open */ - gf_log (this->name, GF_LOG_ERROR, - "%s: present on %d nodes", - local->name, local->index); - - if (local->index < 2) { - unify_local_wipe (local); - gf_log (this->name, GF_LOG_ERROR, - "returning as file found on less " - "than 2 nodes"); - STACK_UNWIND (frame, local->op_ret, - local->op_errno, local->fd); - return 0; - } - } - - if (local->failed) { - /* Open on directory, return EISDIR */ - unify_local_wipe (local); - STACK_UNWIND (frame, -1, EISDIR, local->fd); - return 0; - } - - /* Everything is perfect :) */ - local->call_count = 2; 
- - for (index = 0; file_list[index] != -1; index++) { - char need_break = (file_list[index+1] == -1); - STACK_WIND_COOKIE (frame, - unify_open_cbk, - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]]->fops->open, - &local->loc1, - local->flags, - local->fd, local->wbflags); - if (need_break) - break; - } - } - - return 0; -} - - -int32_t -unify_open_readlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - const char *path, - struct iatt *sbuf) -{ - int16_t index = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - if (op_ret == -1) { - STACK_UNWIND (frame, -1, ENOENT); - return 0; - } - - if (path[0] == '/') { - local->name = gf_strdup (path); - ERR_ABORT (local->name); - } else { - char *tmp_str = gf_strdup (local->loc1.path); - char *tmp_base = dirname (tmp_str); - local->name = GF_CALLOC (1, ZR_PATH_MAX, gf_unify_mt_char); - strcpy (local->name, tmp_base); - strncat (local->name, "/", 1); - strcat (local->name, path); - GF_FREE (tmp_str); - } - - local->list = GF_CALLOC (1, sizeof (int16_t) * 3, - gf_unify_mt_int16_t); - ERR_ABORT (local->list); - local->call_count = priv->child_count + 1; - local->op_ret = -1; - for (index = 0; index <= priv->child_count; index++) { - /* Send the lookup to all the nodes including namespace */ - STACK_WIND_COOKIE (frame, - unify_open_lookup_cbk, - (void *)(long)index, - priv->xl_array[index], - priv->xl_array[index]->fops->lookup, - &local->loc1, - NULL); - } - - return 0; -} -#endif /* GF_DARWIN_HOST_OS */ - -/** - * unify_open - - */ -int32_t -unify_open (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - fd_t *fd, - int32_t wbflags) -{ - unify_private_t *priv = this->private; - unify_local_t *local = NULL; - int16_t *list = NULL; - int16_t index = 0; - int16_t file_list[3] = {0,}; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* 
Init */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - local->fd = fd; - local->flags = flags; - local->wbflags = wbflags; - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - local->list = list; - file_list[0] = priv->child_count; /* Thats namespace */ - file_list[2] = -1; - for (index = 0; list[index] != -1; index++) { - local->call_count++; - if (list[index] != priv->child_count) - file_list[1] = list[index]; - } - - if (local->call_count != 2) { - /* If the lookup was done for file */ - gf_log (this->name, GF_LOG_ERROR, - "%s: entry_count is %d", - loc->path, local->call_count); - for (index = 0; local->list[index] != -1; index++) - gf_log (this->name, GF_LOG_ERROR, "%s: found on %s", - loc->path, priv->xl_array[list[index]]->name); - - if (local->call_count < 2) { - gf_log (this->name, GF_LOG_ERROR, - "returning EIO as file found on onlyone node"); - STACK_UNWIND (frame, -1, EIO, fd); - return 0; - } - } - -#ifdef GF_DARWIN_HOST_OS - /* Handle symlink here */ - if (IA_ISLNK (loc->inode->ia_type)) { - /* Callcount doesn't matter here */ - STACK_WIND (frame, - unify_open_readlink_cbk, - NS(this), - NS(this)->fops->readlink, - loc, ZR_PATH_MAX); - return 0; - } -#endif /* GF_DARWIN_HOST_OS */ - - local->call_count = 2; - for (index = 0; file_list[index] != -1; index++) { - char need_break = (file_list[index+1] == -1); - STACK_WIND_COOKIE (frame, - unify_open_cbk, - priv->xl_array[file_list[index]], //cookie - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]]->fops->open, - loc, - flags, - fd, wbflags); - if (need_break) - break; - } - - return 0; -} - - -int32_t -unify_create_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - inode_t *inode = local->loc1.inode; - - unify_local_wipe (local); - - STACK_UNWIND (frame, local->op_ret, local->op_errno, 
local->fd, - inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - - return 0; -} - -/** - * unify_create_open_cbk - - */ -int32_t -unify_create_open_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - int ret = 0; - int32_t callcnt = 0; - unify_local_t *local = frame->local; - inode_t *inode = NULL; - xlator_t *child = NULL; - uint64_t tmp_value = 0; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - local->op_ret = op_ret; - if (NS(this) != (xlator_t *)cookie) { - /* Store child node's ptr, used in all - the f*** / FileIO calls */ - /* TODO: log on failure */ - ret = fd_ctx_get (fd, this, &tmp_value); - cookie = (void *)(long)tmp_value; - } else { - /* NOTE: open successful on namespace. - * fd's ctx can be used to identify open - * failure on storage subvolume. cool - * ide ;) */ - local->failed = 0; - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - ((xlator_t *)cookie)->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - local->failed = 1; - } - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed == 1 && (local->op_ret >= 0)) { - local->call_count = 1; - /* return -1 to user */ - local->op_ret = -1; - local->op_errno = EIO; - local->fd = fd; - local->call_count = 1; - - if (!fd_ctx_get (local->fd, this, &tmp_value)) { - child = (xlator_t *)(long)tmp_value; - - gf_log (this->name, GF_LOG_ERROR, - "Create success on child node, " - "failed on namespace"); - - STACK_WIND (frame, - unify_create_unlink_cbk, - child, - child->fops->unlink, - &local->loc1); - } else { - gf_log (this->name, GF_LOG_ERROR, - "Create success on namespace, " - "failed on child node"); - - STACK_WIND (frame, - unify_create_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - } - return 0; - } - inode = local->loc1.inode; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, 
local->op_errno, fd, - inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - } - return 0; -} - -/** - * unify_create_lookup_cbk - - */ -int32_t -unify_create_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) -{ - int32_t callcnt = 0; - int16_t index = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - local->failed = 1; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - local->list[local->index++] = (int16_t)(long)cookie; - if (NS(this) == priv->xl_array[(long)cookie]) { - local->ia_ino = buf->ia_ino; - } else { - local->stbuf = *buf; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - int16_t *list = local->list; - int16_t file_list[3] = {0,}; - local->op_ret = -1; - - local->list [local->index] = -1; - file_list[0] = list[0]; - file_list[1] = list[1]; - file_list[2] = -1; - - local->stbuf.ia_ino = local->ia_ino; - /* TODO: log on failure */ - inode_ctx_put (local->loc1.inode, this, - (uint64_t)(long)local->list); - - if (local->index != 2) { - /* Lookup failed, can't do open */ - gf_log (this->name, GF_LOG_ERROR, - "%s: present on %d nodes", - local->loc1.path, local->index); - file_list[0] = priv->child_count; - for (index = 0; list[index] != -1; index++) { - gf_log (this->name, GF_LOG_ERROR, - "%s: found on %s", local->loc1.path, - priv->xl_array[list[index]]->name); - if (list[index] != priv->child_count) - file_list[1] = list[index]; - } - - if (local->index < 2) { - unify_local_wipe (local); - gf_log (this->name, GF_LOG_ERROR, - "returning EIO as file found on " - "only one node"); - STACK_UNWIND (frame, -1, 
EIO, - local->fd, inode, NULL, - NULL, NULL); - return 0; - } - } - /* Everything is perfect :) */ - local->call_count = 2; - - for (index = 0; file_list[index] != -1; index++) { - char need_break = (file_list[index+1] == -1); - STACK_WIND_COOKIE (frame, - unify_create_open_cbk, - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]]->fops->open, - &local->loc1, - local->flags, - local->fd, 0); - if (need_break) - break; - } - } - - return 0; -} - - -/** - * unify_create_cbk - - */ -int32_t -unify_create_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - int ret = 0; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - inode_t *tmp_inode = NULL; - - if (op_ret == -1) { - /* send unlink () on Namespace */ - local->op_errno = op_errno; - local->op_ret = -1; - local->call_count = 1; - gf_log (this->name, GF_LOG_ERROR, - "create failed on %s (file %s, error %s), " - "sending unlink to namespace", - prev_frame->this->name, - local->loc1.path, strerror (op_errno)); - - STACK_WIND (frame, - unify_create_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - local->stbuf = *buf; - /* Just inode number should be from NS node */ - local->stbuf.ia_ino = local->ia_ino; - - /* TODO: log on failure */ - ret = fd_ctx_set (fd, this, (uint64_t)(long)prev_frame->this); - } - - tmp_inode = local->loc1.inode; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, local->fd, - tmp_inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - - return 0; -} - -/** - * unify_ns_create_cbk - - * - */ -int32_t -unify_ns_create_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd, - inode_t 
*inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - struct sched_ops *sched_ops = NULL; - xlator_t *sched_xl = NULL; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t *list = NULL; - int16_t index = 0; - - if (op_ret == -1) { - /* No need to send create request to other servers, as - namespace action failed. Handle exclusive create here. */ - if ((op_errno != EEXIST) || - ((op_errno == EEXIST) && - ((local->flags & O_EXCL) == O_EXCL))) { - /* If its just a create call without O_EXCL, - don't do this */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s): %s", - local->loc1.path, strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent); - return 0; - } - } - - if (op_ret >= 0) { - /* Get the inode number from the NS node */ - local->ia_ino = buf->ia_ino; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - local->op_ret = -1; - - /* Start the mapping list */ - list = GF_CALLOC (1, sizeof (int16_t) * 3, - gf_unify_mt_int16_t); - ERR_ABORT (list); - inode_ctx_put (inode, this, (uint64_t)(long)list); - list[0] = priv->child_count; - list[2] = -1; - - /* This means, file doesn't exist anywhere in the Filesystem */ - sched_ops = priv->sched_ops; - - /* Send create request to the scheduled node now */ - sched_xl = sched_ops->schedule (this, local->loc1.path); - if (sched_xl == NULL) - { - /* send unlink () on Namespace */ - local->op_errno = ENOTCONN; - local->op_ret = -1; - local->call_count = 1; - gf_log (this->name, GF_LOG_ERROR, - "no node online to schedule create:(file %s) " - "sending unlink to namespace", - (local->loc1.path)?local->loc1.path:""); - - STACK_WIND (frame, - unify_create_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - for (index = 0; index < priv->child_count; index++) - if (sched_xl == priv->xl_array[index]) - break; - list[1] = 
index; - - STACK_WIND (frame, unify_create_cbk, - sched_xl, sched_xl->fops->create, - &local->loc1, local->flags, local->mode, fd); - } else { - /* File already exists, and there is no O_EXCL flag */ - - gf_log (this->name, GF_LOG_DEBUG, - "File(%s) already exists on namespace, sending " - "open instead", local->loc1.path); - - local->list = GF_CALLOC (1, sizeof (int16_t) * 3, - gf_unify_mt_int16_t); - ERR_ABORT (local->list); - local->call_count = priv->child_count + 1; - local->op_ret = -1; - for (index = 0; index <= priv->child_count; index++) { - /* Send lookup() to all nodes including namespace */ - STACK_WIND_COOKIE (frame, - unify_create_lookup_cbk, - (void *)(long)index, - priv->xl_array[index], - priv->xl_array[index]->fops->lookup, - &local->loc1, - NULL); - } - } - return 0; -} - -/** - * unify_create - create a file in global namespace first, so other - * clients can see them. Create the file in storage nodes in background. - */ -int32_t -unify_create (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - mode_t mode, - fd_t *fd) -{ - unify_local_t *local = NULL; - - /* Initialization */ - INIT_LOCAL (frame, local); - local->mode = mode; - local->flags = flags; - local->fd = fd; - - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, fd, loc->inode, NULL, - NULL, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_create_cbk, - NS(this), - NS(this)->fops->create, - loc, - flags | O_EXCL, - mode, - fd); - - return 0; -} - - -/** - * unify_opendir_cbk - - */ -int32_t -unify_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - STACK_UNWIND (frame, op_ret, op_errno, fd); - - return 0; -} - -/** - * unify_opendir - - */ -int32_t -unify_opendir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - fd_t *fd) -{ - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR 
(loc); - - STACK_WIND (frame, unify_opendir_cbk, - NS(this), NS(this)->fops->opendir, loc, fd); - - return 0; -} - - -int32_t -unify_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *statpre, - struct iatt *statpost) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s(): child(%s): path(%s): %s", - gf_fop_list[frame->root->op], - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - - local->op_errno = op_errno; - if ((op_errno == ENOENT) && priv->optimist) - local->op_ret = 0; - } - - if (op_ret >= 0) { - local->op_ret = 0; - - if (NS (this) == prev_frame->this) { - local->ia_ino = statpost->ia_ino; - /* If the entry is directory, get the stat - from NS node */ - if (IA_ISDIR (statpost->ia_type) || - !local->stpost.ia_blksize) { - local->stpre = *statpre; - local->stpost = *statpost; - } - } - - if ((!IA_ISDIR (statpost->ia_type)) && - (NS (this) != prev_frame->this)) { - /* If file, take the stat info from Storage - node. 
*/ - local->stpre = *statpre; - local->stpost = *statpost; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - /* If the inode number is not filled, operation should - fail */ - if (!local->ia_ino) - local->op_ret = -1; - - local->stpre.ia_ino = local->ia_ino; - local->stpost.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stpre, &local->stpost); - } - - return 0; -} - - -int32_t -unify_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int32_t index = 0; - int32_t callcnt = 0; - uint64_t tmp_list = 0; - - if (!(loc && loc->inode)) { - STACK_UNWIND (frame, -1, EINVAL, NULL, NULL); - return 0; - } - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - - if (IA_ISDIR (loc->inode->ia_type)) { - local->call_count = 1; - - STACK_WIND (frame, - unify_setattr_cbk, - NS (this), - NS (this)->fops->setattr, - loc, stbuf, valid); - } else { - inode_ctx_get (loc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - for (index = 0; local->list[index] != -1; index++) { - local->call_count++; - callcnt++; - } - - for (index = 0; local->list[index] != -1; index++) { - STACK_WIND (frame, - unify_setattr_cbk, - priv->xl_array[local->list[index]], - priv->xl_array[local->list[index]]->fops->setattr, - loc, stbuf, valid); - - if (!--callcnt) - break; - } - } - - return 0; -} - - -int32_t -unify_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid) -{ - unify_local_t *local = NULL; - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd); - - /* Initialization */ - INIT_LOCAL (frame, local); - - if (!fd_ctx_get (fd, this, &tmp_child)) { - /* If its set, then its file */ - child = (xlator_t *)(long)tmp_child; - - local->call_count = 2; - - STACK_WIND (frame, 
unify_setattr_cbk, child, - child->fops->fsetattr, fd, stbuf, valid); - - STACK_WIND (frame, unify_setattr_cbk, NS(this), - NS(this)->fops->fsetattr, fd, stbuf, valid); - } else { - local->call_count = 1; - - STACK_WIND (frame, unify_setattr_cbk, - NS(this), NS(this)->fops->fsetattr, - fd, stbuf, valid); - } - - return 0; -} - - -/** - * unify_truncate_cbk - - */ -int32_t -unify_truncate_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - local->op_errno = op_errno; - if (!((op_errno == ENOENT) && priv->optimist)) - local->op_ret = -1; - } - - if (op_ret >= 0) { - if (NS (this) == prev_frame->this) { - local->ia_ino = postbuf->ia_ino; - /* If the entry is directory, get the - stat from NS node */ - if (IA_ISDIR (postbuf->ia_type) || - !local->stbuf.ia_blksize) { - local->stbuf = *prebuf; - local->poststbuf = *postbuf; - } - } - - if ((!IA_ISDIR (postbuf->ia_type)) && - (NS (this) != prev_frame->this)) { - /* If file, take the stat info from - Storage node. 
*/ - local->stbuf = *prebuf; - local->poststbuf = *postbuf; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->ia_ino) { - local->stbuf.ia_ino = local->ia_ino; - local->poststbuf.ia_ino = local->ia_ino; - } else { - local->op_ret = -1; - } - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf, &local->poststbuf); - } - - return 0; -} - - -/** - * unify_truncate - - */ -int32_t -unify_truncate (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - off_t offset) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int32_t index = 0; - int32_t callcnt = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - local->ia_ino = loc->inode->ino; - - if (IA_ISDIR (loc->inode->ia_type)) { - local->call_count = 1; - - STACK_WIND (frame, - unify_truncate_cbk, - NS(this), - NS(this)->fops->truncate, - loc, - 0); - } else { - local->op_ret = 0; - inode_ctx_get (loc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - for (index = 0; local->list[index] != -1; index++) { - local->call_count++; - callcnt++; - } - - /* Don't send offset to NS truncate */ - STACK_WIND (frame, unify_truncate_cbk, NS(this), - NS(this)->fops->truncate, loc, 0); - callcnt--; - - for (index = 0; local->list[index] != -1; index++) { - if (NS(this) != priv->xl_array[local->list[index]]) { - STACK_WIND (frame, - unify_truncate_cbk, - priv->xl_array[local->list[index]], - priv->xl_array[local->list[index]]->fops->truncate, - loc, - offset); - if (!--callcnt) - break; - } - } - } - - return 0; -} - -/** - * unify_readlink_cbk - - */ -int32_t -unify_readlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - const char *path, - struct iatt *sbuf) -{ - STACK_UNWIND (frame, op_ret, op_errno, path, sbuf); - return 0; -} - -/** - * unify_readlink - 
Read the link only from the storage node. - */ -int32_t -unify_readlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - size_t size) -{ - unify_private_t *priv = this->private; - int32_t entry_count = 0; - int16_t *list = NULL; - int16_t index = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) - entry_count++; - - if (entry_count >= 2) { - for (index = 0; list[index] != -1; index++) { - if (priv->xl_array[list[index]] != NS(this)) { - STACK_WIND (frame, - unify_readlink_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->readlink, - loc, - size); - break; - } - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "returning ENOENT, no softlink files found " - "on storage node"); - STACK_UNWIND (frame, -1, ENOENT, NULL); - } - - return 0; -} - - -/** - * unify_unlink_cbk - - */ -int32_t -unify_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == 0 || ((op_errno == ENOENT) && priv->optimist)) - local->op_ret = 0; - if (op_ret == -1) - local->op_errno = op_errno; - - if (((call_frame_t *)cookie)->this == NS(this)) { - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->oldpreparent, &local->oldpostparent); - } - - return 0; -} - - -/** - * unify_unlink - - */ -int32_t -unify_unlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - unify_private_t *priv = this->private; - unify_local_t *local = NULL; - int16_t *list = NULL; - 
int16_t index = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) - local->call_count++; - - if (local->call_count) { - for (index = 0; list[index] != -1; index++) { - char need_break = (list[index+1] == -1); - STACK_WIND (frame, - unify_unlink_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->unlink, - loc); - if (need_break) - break; - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "%s: returning ENOENT", loc->path); - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - } - - return 0; -} - - -/** - * unify_readv_cbk - - */ -int32_t -unify_readv_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iovec *vector, - int32_t count, - struct iatt *stbuf, - struct iobref *iobref) -{ - STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf, iobref); - return 0; -} - -/** - * unify_readv - - */ -int32_t -unify_readv (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, - unify_readv_cbk, - child, - child->fops->readv, - fd, - size, - offset); - - - return 0; -} - -/** - * unify_writev_cbk - - */ -int32_t -unify_writev_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - unify_local_t *local = NULL; - - local = frame->local; - - local->stbuf = *prebuf; - local->stbuf.ia_ino = local->ia_ino; - - local->poststbuf = *postbuf; - local->poststbuf.ia_ino = local->ia_ino; - - STACK_UNWIND (frame, op_ret, op_errno, - &local->stbuf, 
&local->poststbuf); - return 0; -} - -/** - * unify_writev - - */ -int32_t -unify_writev (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iovec *vector, - int32_t count, - off_t off, - struct iobref *iobref) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - unify_local_t *local = NULL; - - INIT_LOCAL (frame, local); - local->ia_ino = fd->inode->ino; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, - unify_writev_cbk, - child, - child->fops->writev, - fd, - vector, - count, - off, - iobref); - - return 0; -} - -/** - * unify_ftruncate - - */ -int32_t -unify_ftruncate (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - off_t offset) -{ - xlator_t *child = NULL; - unify_local_t *local = NULL; - uint64_t tmp_child = 0; - - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(fd); - - /* Initialization */ - INIT_LOCAL (frame, local); - local->op_ret = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - local->call_count = 2; - - STACK_WIND (frame, unify_truncate_cbk, - child, child->fops->ftruncate, - fd, offset); - - STACK_WIND (frame, unify_truncate_cbk, - NS(this), NS(this)->fops->ftruncate, - fd, 0); - - return 0; -} - - -/** - * unify_flush_cbk - - */ -int32_t -unify_flush_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -/** - * unify_flush - - */ -int32_t -unify_flush (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_flush_cbk, child, - child->fops->flush, fd); - - return 0; -} - - -/** - * unify_fsync_cbk - - */ -int32_t -unify_fsync_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - 
int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf); - return 0; -} - -/** - * unify_fsync - - */ -int32_t -unify_fsync (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_fsync_cbk, child, - child->fops->fsync, fd, flags); - - return 0; -} - -/** - * unify_fstat - Send fstat FOP to Namespace only if its directory, and to - * both namespace and the storage node if its a file. - */ -int32_t -unify_fstat (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - unify_local_t *local = NULL; - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd); - - INIT_LOCAL (frame, local); - local->ia_ino = fd->inode->ino; - - if (!fd_ctx_get (fd, this, &tmp_child)) { - /* If its set, then its file */ - child = (xlator_t *)(long)tmp_child; - local->call_count = 2; - - STACK_WIND (frame, unify_buf_cbk, child, - child->fops->fstat, fd); - - STACK_WIND (frame, unify_buf_cbk, NS(this), - NS(this)->fops->fstat, fd); - - } else { - /* this is an directory */ - local->call_count = 1; - STACK_WIND (frame, unify_buf_cbk, NS(this), - NS(this)->fops->fstat, fd); - } - - return 0; -} - -/** - * unify_getdents_cbk - - */ -int32_t -unify_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - STACK_UNWIND (frame, op_ret, op_errno, entry, count); - return 0; -} - -/** - * unify_getdents - send the FOP request to all the nodes. 
- */ -int32_t -unify_getdents (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset, - int32_t flag) -{ - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); - - STACK_WIND (frame, unify_getdents_cbk, NS(this), - NS(this)->fops->getdents, fd, size, offset, flag); - - return 0; -} - - -/** - * unify_readdir_cbk - - */ -int32_t -unify_readdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - gf_dirent_t *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - - return 0; -} - -/** - * unify_readdir - send the FOP request to all the nodes. - */ -int32_t -unify_readdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset) -{ - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); - - STACK_WIND (frame, unify_readdir_cbk, NS(this), - NS(this)->fops->readdir, fd, size, offset); - - return 0; -} - - -int32_t -unify_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - - return 0; -} - - -int32_t -unify_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) -{ - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); - - STACK_WIND (frame, unify_readdirp_cbk, NS(this), - NS(this)->fops->readdirp, fd, size, offset); - - return 0; -} - - -/** - * unify_fsyncdir_cbk - - */ -int32_t -unify_fsyncdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - - return 0; -} - -/** - * unify_fsyncdir - - */ -int32_t -unify_fsyncdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags) -{ - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); - - STACK_WIND (frame, unify_fsyncdir_cbk, - NS(this), NS(this)->fops->fsyncdir, fd, flags); - - return 0; -} - -/** - * unify_lk_cbk - UNWIND frame with the proper return arguments. 
- */ -int32_t -unify_lk_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct gf_flock *lock) -{ - STACK_UNWIND (frame, op_ret, op_errno, lock); - return 0; -} - -/** - * unify_lk - Send it to all the storage nodes, (should be 1) which has file. - */ -int32_t -unify_lk (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t cmd, - struct gf_flock *lock) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_lk_cbk, child, - child->fops->lk, fd, cmd, lock); - - return 0; -} - - -int32_t -unify_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno); - -static int32_t -unify_setxattr_file_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - unify_private_t *private = this->private; - unify_local_t *local = frame->local; - xlator_t *sched_xl = NULL; - struct sched_ops *sched_ops = NULL; - - if (op_ret == -1) { - if (!ENOTSUP) - gf_log (this->name, GF_LOG_ERROR, - "setxattr with XATTR_CREATE on ns: " - "path(%s) key(%s): %s", - local->loc1.path, local->name, - strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno); - return 0; - } - - LOCK (&frame->lock); - { - local->failed = 0; - local->op_ret = 0; - local->op_errno = 0; - local->call_count = 1; - } - UNLOCK (&frame->lock); - - /* schedule XATTR_CREATE on one of the child node */ - sched_ops = private->sched_ops; - - /* Send create request to the scheduled node now */ - sched_xl = sched_ops->schedule (this, local->name); - if (!sched_xl) { - STACK_UNWIND (frame, -1, ENOTCONN); - return 0; - } - - STACK_WIND (frame, - unify_setxattr_cbk, - sched_xl, - sched_xl->fops->setxattr, - &local->loc1, - local->dict, - local->flags); - return 0; -} - -/** - * unify_setxattr_cbk - When 
all the child nodes return, UNWIND frame. - */ -int32_t -unify_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - dict_t *dict = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, (((op_errno == ENOENT) || - (op_errno == ENOTSUP))? - GF_LOG_DEBUG : GF_LOG_ERROR), - "child(%s): path(%s): %s", - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - if (local->failed == -1) { - local->failed = 1; - } - local->op_errno = op_errno; - } else { - local->failed = 0; - local->op_ret = op_ret; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed && local->name && - ZR_FILE_CONTENT_REQUEST(local->name)) { - dict = get_new_dict (); - dict_set (dict, local->dict->members_list->key, - data_from_dynptr(NULL, 0)); - dict_ref (dict); - - local->call_count = 1; - - STACK_WIND (frame, - unify_setxattr_file_cbk, - NS(this), - NS(this)->fops->setxattr, - &local->loc1, - dict, - XATTR_CREATE); - - dict_unref (dict); - return 0; - } - - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno); - } - - return 0; -} - -/** - * unify_sexattr - This function should be sent to all the storage nodes, - * which contains the file, (excluding namespace). 
- */ -int32_t -unify_setxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *dict, - int32_t flags) -{ - unify_private_t *priv = this->private; - unify_local_t *local = NULL; - int16_t *list = NULL; - int16_t index = 0; - int32_t call_count = 0; - uint64_t tmp_list = 0; - data_pair_t *trav = dict->members_list; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - local->failed = -1; - loc_copy (&local->loc1, loc); - - if (IA_ISDIR (loc->inode->ia_type)) { - - if (trav && trav->key && ZR_FILE_CONTENT_REQUEST(trav->key)) { - /* direct the storage xlators to change file - content only if file exists */ - local->flags = flags; - local->dict = dict; - local->name = gf_strdup (trav->key); - flags |= XATTR_REPLACE; - } - - local->call_count = priv->child_count; - for (index = 0; index < priv->child_count; index++) { - STACK_WIND (frame, - unify_setxattr_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->setxattr, - loc, dict, flags); - } - return 0; - } - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - local->call_count++; - call_count++; - } - } - - if (local->call_count) { - for (index = 0; list[index] != -1; index++) { - if (priv->xl_array[list[index]] != NS(this)) { - STACK_WIND (frame, - unify_setxattr_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->setxattr, - loc, - dict, - flags); - if (!--call_count) - break; - } - } - return 0; - } - - /* No entry in storage nodes */ - gf_log (this->name, GF_LOG_DEBUG, - "returning ENOENT, file not found on storage node."); - STACK_UNWIND (frame, -1, ENOENT); - - return 0; -} - - -/** - * unify_getxattr_cbk - This function is called from only one child, so, no - * need of any lock or anything else, just send it to above layer - */ -int32_t -unify_getxattr_cbk (call_frame_t *frame, - void 
*cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *value) -{ - int32_t callcnt = 0; - dict_t *local_value = NULL; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, - (((op_errno == ENOENT) || - (op_errno == ENODATA) || - (op_errno == ENOTSUP)) ? - GF_LOG_DEBUG : GF_LOG_ERROR), - "child(%s): path(%s): %s", - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - } else { - if (!local->dict) - local->dict = dict_ref (value); - local->op_ret = op_ret; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local_value = local->dict; - local->dict = NULL; - - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local_value); - - if (local_value) - dict_unref (local_value); - } - - return 0; -} - - -/** - * unify_getxattr - This FOP is sent to only the storage node. - */ -int32_t -unify_getxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - unify_private_t *priv = this->private; - int16_t *list = NULL; - int16_t index = 0; - int16_t count = 0; - unify_local_t *local = NULL; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - INIT_LOCAL (frame, local); - - if (IA_ISDIR (loc->inode->ia_type)) { - local->call_count = priv->child_count; - for (index = 0; index < priv->child_count; index++) - STACK_WIND (frame, - unify_getxattr_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->getxattr, - loc, - name); - return 0; - } - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - local->call_count++; - count++; - } - } - - if (count) { - for (index = 0; list[index] != -1; index++) { - if (priv->xl_array[list[index]] != NS(this)) { - STACK_WIND (frame, - 
unify_getxattr_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->getxattr, - loc, - name); - if (!--count) - break; - } - } - } else { - dict_t *tmp_dict = get_new_dict (); - gf_log (this->name, GF_LOG_DEBUG, - "%s: returning ENODATA, no file found on storage node", - loc->path); - STACK_UNWIND (frame, -1, ENODATA, tmp_dict); - dict_destroy (tmp_dict); - } - - return 0; -} - -/** - * unify_removexattr_cbk - Wait till all the child node returns the call - * and then UNWIND to above layer. - */ -int32_t -unify_removexattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == -1) { - local->op_errno = op_errno; - if (op_errno != ENOTSUP) - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - prev_frame->this->name, - local->loc1.path, strerror (op_errno)); - } else { - local->op_ret = op_ret; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - STACK_UNWIND (frame, local->op_ret, local->op_errno); - } - - return 0; -} - -/** - * unify_removexattr - Send it to all the child nodes which has the files. 
- */ -int32_t -unify_removexattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - unify_private_t *priv = this->private; - unify_local_t *local = NULL; - int16_t *list = NULL; - int16_t index = 0; - int32_t call_count = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - - if (IA_ISDIR (loc->inode->ia_type)) { - local->call_count = priv->child_count; - for (index = 0; index < priv->child_count; index++) - STACK_WIND (frame, - unify_removexattr_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->removexattr, - loc, - name); - - return 0; - } - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - local->call_count++; - call_count++; - } - } - - if (local->call_count) { - for (index = 0; list[index] != -1; index++) { - if (priv->xl_array[list[index]] != NS(this)) { - STACK_WIND (frame, - unify_removexattr_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->removexattr, - loc, - name); - if (!--call_count) - break; - } - } - return 0; - } - - gf_log (this->name, GF_LOG_DEBUG, - "%s: returning ENOENT, not found on storage node.", loc->path); - STACK_UNWIND (frame, -1, ENOENT); - - return 0; -} - - -int32_t -unify_mknod_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - - if (op_ret == -1) - gf_log (this->name, GF_LOG_ERROR, - "%s: %s", local->loc1.path, strerror (op_errno)); - - unify_local_wipe (local); - /* No log required here as this -1 is for mknod call */ - STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL); - return 0; -} - -/** - * unify_mknod_cbk - - */ -int32_t -unify_mknod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - 
int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "mknod failed on storage node, sending unlink to " - "namespace"); - local->op_errno = op_errno; - STACK_WIND (frame, - unify_mknod_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - return 0; - } - - local->stbuf = *buf; - local->stbuf.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - return 0; -} - -/** - * unify_ns_mknod_cbk - - */ -int32_t -unify_ns_mknod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - struct sched_ops *sched_ops = NULL; - xlator_t *sched_xl = NULL; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t *list = NULL; - int16_t index = 0; - call_frame_t *prev_frame = cookie; - - if (op_ret == -1) { - /* No need to send mknod request to other servers, - * as namespace action failed - */ - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - prev_frame->this->name, local->loc1.path, - strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, buf, - preparent, postparent); - return 0; - } - - /* Create one inode for this entry */ - local->op_ret = 0; - local->stbuf = *buf; - local->ia_ino = buf->ia_ino; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - list = GF_CALLOC (1, sizeof (int16_t) * 3, gf_unify_mt_int16_t); - ERR_ABORT (list); - list[0] = priv->child_count; - list[2] = -1; - inode_ctx_put (inode, this, (uint64_t)(long)list); - - sched_ops = priv->sched_ops; - - /* Send mknod request to scheduled node now */ - sched_xl = 
sched_ops->schedule (this, local->loc1.path); - if (!sched_xl) { - gf_log (this->name, GF_LOG_ERROR, - "mknod failed on storage node, no node online " - "at the moment, sending unlink to NS"); - local->op_errno = ENOTCONN; - STACK_WIND (frame, - unify_mknod_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - for (index = 0; index < priv->child_count; index++) - if (sched_xl == priv->xl_array[index]) - break; - list[1] = index; - - STACK_WIND (frame, unify_mknod_cbk, - sched_xl, sched_xl->fops->mknod, - &local->loc1, local->mode, local->dev); - - return 0; -} - -/** - * unify_mknod - Create a device on namespace first, and later create on - * the storage node. - */ -int32_t -unify_mknod (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode, - dev_t rdev) -{ - unify_local_t *local = NULL; - - /* Initialization */ - INIT_LOCAL (frame, local); - local->mode = mode; - local->dev = rdev; - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_mknod_cbk, - NS(this), - NS(this)->fops->mknod, - loc, - mode, - rdev); - - return 0; -} - -int32_t -unify_symlink_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - if (op_ret == -1) - gf_log (this->name, GF_LOG_ERROR, - "%s: %s", local->loc1.path, strerror (op_errno)); - - unify_local_wipe (local); - STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL); - return 0; -} - -/** - * unify_symlink_cbk - - */ -int32_t -unify_symlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - - 
if (op_ret == -1) { - /* Symlink on storage node failed, hence send unlink - to the NS node */ - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_ERROR, - "symlink on storage node failed, sending unlink " - "to namespace"); - - STACK_WIND (frame, - unify_symlink_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - local->stbuf = *buf; - local->stbuf.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - - return 0; -} - -/** - * unify_ns_symlink_cbk - - */ -int32_t -unify_ns_symlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - - struct sched_ops *sched_ops = NULL; - xlator_t *sched_xl = NULL; - int16_t *list = NULL; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t index = 0; - - if (op_ret == -1) { - /* No need to send symlink request to other servers, - * as namespace action failed - */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s): %s", - local->loc1.path, strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, NULL, buf, - preparent, postparent); - return 0; - } - - /* Create one inode for this entry */ - local->op_ret = 0; - local->ia_ino = buf->ia_ino; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - /* Start the mapping list */ - - list = GF_CALLOC (1, sizeof (int16_t) * 3, gf_unify_mt_int16_t); - ERR_ABORT (list); - list[0] = priv->child_count; //namespace's index - list[2] = -1; - inode_ctx_put (inode, this, (uint64_t)(long)list); - - sched_ops = priv->sched_ops; - - /* Send symlink request to all the nodes now */ - sched_xl = sched_ops->schedule (this, local->loc1.path); - if (!sched_xl) { - /* Symlink on storage node failed, hence send 
unlink - to the NS node */ - local->op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_ERROR, - "symlink on storage node failed, no node online, " - "sending unlink to namespace"); - - STACK_WIND (frame, - unify_symlink_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - for (index = 0; index < priv->child_count; index++) - if (sched_xl == priv->xl_array[index]) - break; - list[1] = index; - - STACK_WIND (frame, - unify_symlink_cbk, - sched_xl, - sched_xl->fops->symlink, - local->name, - &local->loc1); - - return 0; -} - -/** - * unify_symlink - - */ -int32_t -unify_symlink (call_frame_t *frame, - xlator_t *this, - const char *linkpath, - loc_t *loc) -{ - unify_local_t *local = NULL; - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - local->name = gf_strdup (linkpath); - - if ((local->name == NULL) || - (local->loc1.path == NULL)) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_symlink_cbk, - NS(this), - NS(this)->fops->symlink, - linkpath, - loc); - - return 0; -} - - -int32_t -unify_rename_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s -> %s): %s", - prev_frame->this->name, - local->loc1.path, local->loc2.path, - strerror (op_errno)); - - } - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local->stbuf.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf); - } - return 0; -} - -int32_t -unify_ns_rename_undo_cbk (call_frame_t *frame, - void *cookie, - 
xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf, - struct iatt *preoldparent, - struct iatt *postoldparent, - struct iatt *prenewparent, - struct iatt *postnewparent) -{ - unify_local_t *local = frame->local; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s -> %s): %s", - local->loc1.path, local->loc2.path, - strerror (op_errno)); - } - - local->stbuf.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, &local->stbuf); - return 0; -} - -int32_t -unify_rename_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf, - struct iatt *preoldparent, - struct iatt *postoldparent, - struct iatt *prenewparent, - struct iatt *postnewparent) -{ - int32_t index = 0; - int32_t callcnt = 0; - int16_t *list = NULL; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret >= 0) { - if (!IA_ISDIR (buf->ia_type)) - local->stbuf = *buf; - local->op_ret = op_ret; - } else { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s -> %s): %s", - prev_frame->this->name, - local->loc1.path, local->loc2.path, - strerror (op_errno)); - local->op_errno = op_errno; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local->stbuf.ia_ino = local->ia_ino; - if (IA_ISDIR (local->loc1.inode->ia_type)) { - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf, &local->oldpreparent, - &local->oldpostparent, &local->newpreparent, - &local->newpostparent); - return 0; - } - - if (local->op_ret == -1) { - /* TODO: check this logic */ - - /* Rename failed in storage node, successful on NS, - * hence, rename back the entries in NS */ - /* NOTE: this will be done only if the destination - * doesn't exists, if the destination exists, the - * job of 
correcting NS is left to self-heal - */ - if (!local->index) { - loc_t tmp_oldloc = { - /* its actual 'newloc->path' */ - .path = local->loc2.path, - .inode = local->loc1.inode, - .parent = local->loc2.parent - }; - - loc_t tmp_newloc = { - /* Actual 'oldloc->path' */ - .path = local->loc1.path, - .parent = local->loc1.parent - }; - - gf_log (this->name, GF_LOG_ERROR, - "rename succussful on namespace, on " - "stroage node failed, reverting back"); - - STACK_WIND (frame, - unify_ns_rename_undo_cbk, - NS(this), - NS(this)->fops->rename, - &tmp_oldloc, - &tmp_newloc); - return 0; - } - } else { - /* Rename successful on storage nodes */ - - int32_t idx = 0; - int16_t *tmp_list = NULL; - uint64_t tmp_list_int64 = 0; - if (local->loc2.inode) { - inode_ctx_get (local->loc2.inode, - this, &tmp_list_int64); - list = (int16_t *)(long)tmp_list_int64; - - } - - if (list) { - for (index = 0; list[index] != -1; index++); - tmp_list = GF_CALLOC (1, index * 2, - gf_unify_mt_int16_t); - memcpy (tmp_list, list, index * 2); - - for (index = 0; list[index] != -1; index++) { - /* TODO: Check this logic. 
*/ - /* If the destination file exists in - * the same storage node where we sent - * 'rename' call, no need to send - * unlink - */ - for (idx = 0; - local->list[idx] != -1; idx++) { - if (tmp_list[index] == local->list[idx]) { - tmp_list[index] = priv->child_count; - continue; - } - } - - if (NS(this) != priv->xl_array[tmp_list[index]]) { - local->call_count++; - callcnt++; - } - } - - if (local->call_count) { - if (callcnt > 1) - gf_log (this->name, - GF_LOG_ERROR, - "%s->%s: more (%d) " - "subvolumes have the " - "newloc entry", - local->loc1.path, - local->loc2.path, - callcnt); - - for (index=0; - tmp_list[index] != -1; index++) { - if (NS(this) != priv->xl_array[tmp_list[index]]) { - STACK_WIND (frame, - unify_rename_unlink_cbk, - priv->xl_array[tmp_list[index]], - priv->xl_array[tmp_list[index]]->fops->unlink, - &local->loc2); - if (!--callcnt) - break; - } - } - - GF_FREE (tmp_list); - return 0; - } - if (tmp_list) - GF_FREE (tmp_list); - } - } - - /* Need not send 'unlink' to storage node */ - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, - local->op_errno, &local->stbuf, - &local->oldpreparent, &local->oldpostparent, - &local->newpreparent, &local->newpostparent); - } - - return 0; -} - -int32_t -unify_ns_rename_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf, - struct iatt *preoldparent, - struct iatt *postoldparent, - struct iatt *prenewparent, - struct iatt *postnewparent) -{ - int32_t index = 0; - int32_t callcnt = 0; - int16_t *list = NULL; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - if (op_ret == -1) { - /* Free local->new_inode */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s -> %s): %s", - local->loc1.path, local->loc2.path, - strerror (op_errno)); - - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, buf, - preoldparent, postoldparent, - prenewparent, postnewparent); - return 0; - } - - 
local->stbuf = *buf; - local->ia_ino = buf->ia_ino; - - local->oldpreparent = *preoldparent; - local->oldpostparent = *postoldparent; - local->newpreparent = *prenewparent; - local->newpostparent = *postnewparent; - - /* Everything is fine. */ - if (IA_ISDIR (buf->ia_type)) { - local->call_count = priv->child_count; - for (index=0; index < priv->child_count; index++) { - STACK_WIND (frame, - unify_rename_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->rename, - &local->loc1, - &local->loc2); - } - - return 0; - } - - local->call_count = 0; - /* send rename */ - list = local->list; - for (index=0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - local->call_count++; - callcnt++; - } - } - - if (local->call_count) { - for (index=0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - STACK_WIND (frame, - unify_rename_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->rename, - &local->loc1, - &local->loc2); - if (!--callcnt) - break; - } - } - } else { - /* file doesn't seem to be present in storage nodes */ - gf_log (this->name, GF_LOG_CRITICAL, - "CRITICAL: source file not in storage node, " - "rename successful on namespace :O"); - unify_local_wipe (local); - STACK_UNWIND (frame, -1, EIO, NULL, - NULL, NULL, /* preoldparent, postoldparent */ - NULL, NULL); /* prenewparent, postnewparent */ - } - return 0; -} - - -/** - * unify_rename - One of the tricky function. 
The deadliest of all :O - */ -int32_t -unify_rename (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - unify_local_t *local = NULL; - uint64_t tmp_list = 0; - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, oldloc); - loc_copy (&local->loc2, newloc); - - if ((local->loc1.path == NULL) || - (local->loc2.path == NULL)) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, - NULL, NULL, /* preoldparent, postoldparent */ - NULL, NULL); /* prenewparent, postnewparent */ - return 0; - } - - inode_ctx_get (oldloc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - STACK_WIND (frame, - unify_ns_rename_cbk, - NS(this), - NS(this)->fops->rename, - oldloc, - newloc); - return 0; -} - -/** - * unify_link_cbk - - */ -int32_t -unify_link_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - - if (op_ret >= 0) - local->stbuf = *buf; - local->stbuf.ia_ino = local->ia_ino; - - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - - return 0; -} - -/** - * unify_ns_link_cbk - - */ -int32_t -unify_ns_link_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - int16_t *list = local->list; - int16_t index = 0; - - if (op_ret == -1) { - /* No need to send link request to other servers, - * as namespace action failed - */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s -> %s): %s", - local->loc1.path, local->loc2.path, - strerror (op_errno)); - unify_local_wipe (local); - 
STACK_UNWIND (frame, op_ret, op_errno, inode, buf, - preparent, postparent); - return 0; - } - - /* Update inode for this entry */ - local->op_ret = 0; - local->ia_ino = buf->ia_ino; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - /* Send link request to the node now */ - for (index = 0; list[index] != -1; index++) { - char need_break = (list[index+1] == -1); - if (priv->xl_array[list[index]] != NS (this)) { - STACK_WIND (frame, - unify_link_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->link, - &local->loc1, - &local->loc2); - break; - } - if (need_break) - break; - } - - return 0; -} - -/** - * unify_link - - */ -int32_t -unify_link (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - unify_local_t *local = NULL; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc); - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (newloc); - - /* Initialization */ - INIT_LOCAL (frame, local); - - loc_copy (&local->loc1, oldloc); - loc_copy (&local->loc2, newloc); - - inode_ctx_get (oldloc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - STACK_WIND (frame, - unify_ns_link_cbk, - NS(this), - NS(this)->fops->link, - oldloc, - newloc); - - return 0; -} - - -/** - * unify_checksum_cbk - - */ -int32_t -unify_checksum_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - uint8_t *fchecksum, - uint8_t *dchecksum) -{ - STACK_UNWIND (frame, op_ret, op_errno, fchecksum, dchecksum); - - return 0; -} - -/** - * unify_checksum - - */ -int32_t -unify_checksum (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flag) -{ - STACK_WIND (frame, - unify_checksum_cbk, - NS(this), - NS(this)->fops->checksum, - loc, - flag); - - return 0; -} - - -/** - * unify_finodelk_cbk - - */ -int -unify_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND (frame, 
op_ret, op_errno); - return 0; -} - -/** - * unify_finodelk - */ -int -unify_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int cmd, struct gf_flock *flock) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_finodelk_cbk, - child, child->fops->finodelk, - volume, fd, cmd, flock); - - return 0; -} - - - -/** - * unify_fentrylk_cbk - - */ -int -unify_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -/** - * unify_fentrylk - */ -int -unify_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) - -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_fentrylk_cbk, - child, child->fops->fentrylk, - volume, fd, basename, cmd, type); - - return 0; -} - - - -/** - * unify_fxattrop_cbk - - */ -int -unify_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) -{ - STACK_UNWIND (frame, op_ret, op_errno, xattr); - return 0; -} - -/** - * unify_fxattrop - */ -int -unify_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_fxattrop_cbk, - child, child->fops->fxattrop, - fd, optype, xattr); - - return 0; -} - - -/** - * unify_inodelk_cbk - - */ -int -unify_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, 
int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - -/** - * unify_inodelk - */ -int -unify_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int cmd, struct gf_flock *flock) -{ - xlator_t *child = NULL; - - child = unify_loc_subvol (loc, this); - - STACK_WIND (frame, unify_inodelk_cbk, - child, child->fops->inodelk, - volume, loc, cmd, flock); - - return 0; -} - - - -/** - * unify_entrylk_cbk - - */ -int -unify_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -/** - * unify_entrylk - */ -int -unify_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) - -{ - xlator_t *child = NULL; - - child = unify_loc_subvol (loc, this); - - STACK_WIND (frame, unify_entrylk_cbk, - child, child->fops->entrylk, - volume, loc, basename, cmd, type); - - return 0; -} - - - -/** - * unify_xattrop_cbk - - */ -int -unify_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) -{ - STACK_UNWIND (frame, op_ret, op_errno, xattr); - return 0; -} - -/** - * unify_xattrop - */ -int -unify_xattrop (call_frame_t *frame, xlator_t *this, - loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr) -{ - xlator_t *child = NULL; - - child = unify_loc_subvol (loc, this); - - STACK_WIND (frame, unify_xattrop_cbk, - child, child->fops->xattrop, - loc, optype, xattr); - - return 0; -} - -int -unify_forget (xlator_t *this, - inode_t *inode) -{ - int16_t *list = NULL; - uint64_t tmp_list = 0; - - if (inode->ia_type && (!IA_ISDIR(inode->ia_type))) { - inode_ctx_get (inode, this, &tmp_list); - if (tmp_list) { - list = (int16_t *)(long)tmp_list; - GF_FREE (list); - } - } - - return 0; -} - -/** - * notify - */ -int32_t -notify (xlator_t *this, - int32_t event, - void *data, - ...) 
-{ - unify_private_t *priv = this->private; - struct sched_ops *sched = NULL; - - if (!priv) { - return 0; - } - - sched = priv->sched_ops; - if (!sched) { - gf_log (this->name, GF_LOG_CRITICAL, "No scheduler :O"); - raise (SIGTERM); - return 0; - } - if (priv->namespace == data) { - if (event == GF_EVENT_CHILD_UP) { - sched->notify (this, event, data); - } - return 0; - } - - switch (event) - { - case GF_EVENT_CHILD_UP: - { - /* Call scheduler's update () to enable it for scheduling */ - sched->notify (this, event, data); - - LOCK (&priv->lock); - { - /* Increment the inode's generation, which is - used for self_heal */ - ++priv->inode_generation; - ++priv->num_child_up; - } - UNLOCK (&priv->lock); - - if (!priv->is_up) { - default_notify (this, event, data); - priv->is_up = 1; - } - } - break; - case GF_EVENT_CHILD_DOWN: - { - /* Call scheduler's update () to disable the child node - * for scheduling - */ - sched->notify (this, event, data); - LOCK (&priv->lock); - { - --priv->num_child_up; - } - UNLOCK (&priv->lock); - - if (priv->num_child_up == 0) { - /* Send CHILD_DOWN to upper layer */ - default_notify (this, event, data); - priv->is_up = 0; - } - } - break; - - default: - { - default_notify (this, event, data); - } - break; - } - - return 0; -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_unify_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; -} - -/** - * init - This function is called first in the xlator, while initializing. - * All the config file options are checked and appropriate flags are set. 
- * - * @this - - */ -int32_t -init (xlator_t *this) -{ - int32_t ret = 0; - int32_t count = 0; - data_t *scheduler = NULL; - data_t *data = NULL; - xlator_t *ns_xl = NULL; - xlator_list_t *trav = NULL; - xlator_list_t *xlparent = NULL; - xlator_list_t *parent = NULL; - unify_private_t *_private = NULL; - - - /* Check for number of child nodes, if there is no child nodes, exit */ - if (!this->children) { - gf_log (this->name, GF_LOG_ERROR, - "No child nodes specified. check \"subvolumes \" " - "option in volfile"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - /* Check for 'scheduler' in volume */ - scheduler = dict_get (this->options, "scheduler"); - if (!scheduler) { - gf_log (this->name, GF_LOG_ERROR, - "\"option scheduler <x>\" is missing in volfile"); - return -1; - } - - /* Setting "option namespace <node>" */ - data = dict_get (this->options, "namespace"); - if(!data) { - gf_log (this->name, GF_LOG_CRITICAL, - "namespace option not specified, Exiting"); - return -1; - } - /* Search namespace in the child node, if found, exit */ - trav = this->children; - while (trav) { - if (strcmp (trav->xlator->name, data->data) == 0) - break; - trav = trav->next; - } - if (trav) { - gf_log (this->name, GF_LOG_CRITICAL, - "namespace node used as a subvolume, Exiting"); - return -1; - } - - /* Search for the namespace node, if found, continue */ - ns_xl = this->next; - while (ns_xl) { - if (strcmp (ns_xl->name, data->data) == 0) - break; - ns_xl = ns_xl->next; - } - if (!ns_xl) { - gf_log (this->name, GF_LOG_CRITICAL, - "namespace node not found in volfile, Exiting"); - return -1; - } - - gf_log (this->name, GF_LOG_DEBUG, - "namespace node specified as %s", data->data); - - _private = GF_CALLOC (1, sizeof (*_private), - gf_unify_mt_unify_private_t); - ERR_ABORT (_private); - _private->sched_ops = get_scheduler (this, scheduler->data); - if (!_private->sched_ops) { - gf_log (this->name, 
GF_LOG_CRITICAL, - "Error while loading scheduler. Exiting"); - GF_FREE (_private); - return -1; - } - - if (ns_xl->parents) { - gf_log (this->name, GF_LOG_CRITICAL, - "Namespace node should not be a child of any other node. Exiting"); - GF_FREE (_private); - return -1; - } - - _private->namespace = ns_xl; - - /* update _private structure */ - { - count = 0; - trav = this->children; - /* Get the number of child count */ - while (trav) { - count++; - trav = trav->next; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Child node count is %d", count); - - _private->child_count = count; - if (count == 1) { - /* TODO: Should I error out here? */ - gf_log (this->name, GF_LOG_CRITICAL, - "WARNING: You have defined only one " - "\"subvolumes\" for unify volume. It may not " - "be the desired config, review your volume " - "volfile. If this is how you are testing it," - " you may hit some performance penalty"); - } - - _private->xl_array = GF_CALLOC (1, - sizeof (xlator_t) * (count + 1), - gf_unify_mt_xlator_t); - ERR_ABORT (_private->xl_array); - - count = 0; - trav = this->children; - while (trav) { - _private->xl_array[count++] = trav->xlator; - trav = trav->next; - } - _private->xl_array[count] = _private->namespace; - - /* self-heal part, start with generation '1' */ - _private->inode_generation = 1; - /* Because, Foreground part is tested well */ - _private->self_heal = ZR_UNIFY_FG_SELF_HEAL; - data = dict_get (this->options, "self-heal"); - if (data) { - if (strcasecmp (data->data, "off") == 0) - _private->self_heal = ZR_UNIFY_SELF_HEAL_OFF; - - if (strcasecmp (data->data, "foreground") == 0) - _private->self_heal = ZR_UNIFY_FG_SELF_HEAL; - - if (strcasecmp (data->data, "background") == 0) - _private->self_heal = ZR_UNIFY_BG_SELF_HEAL; - } - - /* optimist - ask bulde for more about it */ - data = dict_get (this->options, "optimist"); - if (data) { - if (gf_string2boolean (data->data, - &_private->optimist) == -1) { - gf_log (this->name, GF_LOG_ERROR, - "optimist excepts 
only boolean " - "options"); - } - } - - LOCK_INIT (&_private->lock); - } - - /* Now that everything is fine. */ - this->private = (void *)_private; - { - ret = _private->sched_ops->mem_acct_init (this); - - if (ret == -1) { - return -1; - } - - /* Initialize scheduler, if everything else is successful */ - ret = _private->sched_ops->init (this); - if (ret == -1) { - gf_log (this->name, GF_LOG_CRITICAL, - "Initializing scheduler failed, Exiting"); - GF_FREE (_private); - return -1; - } - - - ret = 0; - - /* This section is required because some fops may look - * for 'xl->parent' variable - */ - xlparent = GF_CALLOC (1, sizeof (*xlparent), - gf_unify_mt_xlator_list_t); - xlparent->xlator = this; - if (!ns_xl->parents) { - ns_xl->parents = xlparent; - } else { - parent = ns_xl->parents; - while (parent->next) - parent = parent->next; - parent->next = xlparent; - } - } - - /* Tell namespace node that init is done */ - xlator_notify (ns_xl, GF_EVENT_PARENT_UP, this); - - return 0; -} - -/** - * fini - Free all the allocated memory - */ -void -fini (xlator_t *this) -{ - unify_private_t *priv = this->private; - priv->sched_ops->fini (this); - this->private = NULL; - LOCK_DESTROY (&priv->lock); - GF_FREE (priv->xl_array); - GF_FREE (priv); - return; -} - - -struct xlator_fops fops = { - .stat = unify_stat, - .readlink = unify_readlink, - .mknod = unify_mknod, - .mkdir = unify_mkdir, - .unlink = unify_unlink, - .rmdir = unify_rmdir, - .symlink = unify_symlink, - .rename = unify_rename, - .link = unify_link, - .truncate = unify_truncate, - .create = unify_create, - .open = unify_open, - .readv = unify_readv, - .writev = unify_writev, - .statfs = unify_statfs, - .flush = unify_flush, - .fsync = unify_fsync, - .setxattr = unify_setxattr, - .getxattr = unify_getxattr, - .removexattr = unify_removexattr, - .opendir = unify_opendir, - .readdir = unify_readdir, - .readdirp = unify_readdirp, - .fsyncdir = unify_fsyncdir, - .access = unify_access, - .ftruncate = unify_ftruncate, - 
.fstat = unify_fstat, - .lk = unify_lk, - .lookup = unify_lookup, - .getdents = unify_getdents, - .checksum = unify_checksum, - .inodelk = unify_inodelk, - .finodelk = unify_finodelk, - .entrylk = unify_entrylk, - .fentrylk = unify_fentrylk, - .xattrop = unify_xattrop, - .fxattrop = unify_fxattrop, - .setattr = unify_setattr, - .fsetattr = unify_fsetattr, -}; - - -struct xlator_cbks cbks = { - .forget = unify_forget, -}; - -struct volume_options options[] = { - { .key = { "namespace" }, - .type = GF_OPTION_TYPE_XLATOR - }, - { .key = { "scheduler" }, - .value = { "alu", "rr", "random", "nufa", "switch" }, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"self-heal"}, - .value = { "foreground", "background", "off" }, - .type = GF_OPTION_TYPE_STR - }, - /* TODO: remove it some time later */ - { .key = {"optimist"}, - .type = GF_OPTION_TYPE_BOOL - }, - - { .key = {NULL} }, -}; diff --git a/xlators/cluster/unify/src/unify.h b/xlators/cluster/unify/src/unify.h deleted file mode 100644 index 3cfe725f43a..00000000000 --- a/xlators/cluster/unify/src/unify.h +++ /dev/null @@ -1,146 +0,0 @@ -/* - Copyright (c) 2006-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#ifndef _UNIFY_H -#define _UNIFY_H - -#include "scheduler.h" -#include "list.h" -#include "unify-mem-types.h" - -#define MAX_DIR_ENTRY_STRING (32 * 1024) - -#define ZR_UNIFY_SELF_HEAL_OFF 0 -#define ZR_UNIFY_FG_SELF_HEAL 1 -#define ZR_UNIFY_BG_SELF_HEAL 2 - -/* Sometimes one should use completely random numbers.. its good :p */ -#define UNIFY_SELF_HEAL_GETDENTS_COUNT 512 - -#define NS(xl) (((unify_private_t *)xl->private)->namespace) - -/* This is used to allocate memory for local structure */ -#define INIT_LOCAL(fr, loc) \ -do { \ - loc = GF_CALLOC (1, sizeof (unify_local_t), gf_unify_mt_unify_local_t); \ - ERR_ABORT (loc); \ - if (!loc) { \ - STACK_UNWIND (fr, -1, ENOMEM); \ - return 0; \ - } \ - fr->local = loc; \ - loc->op_ret = -1; \ - loc->op_errno = ENOENT; \ -} while (0) - - - -struct unify_private { - /* Update this structure depending on requirement */ - void *scheduler; /* THIS SHOULD BE THE FIRST VARIABLE, - if xlator is using scheduler */ - struct sched_ops *sched_ops; /* Scheduler options */ - xlator_t *namespace; /* ptr to namespace xlator */ - xlator_t **xl_array; - gf_boolean_t optimist; - int16_t child_count; - int16_t num_child_up; - uint8_t self_heal; - uint8_t is_up; - uint64_t inode_generation; - gf_lock_t lock; -}; -typedef struct unify_private unify_private_t; - -struct unify_self_heal_struct { - uint8_t dir_checksum[NAME_MAX]; - uint8_t ns_dir_checksum[NAME_MAX]; - uint8_t file_checksum[NAME_MAX]; - uint8_t ns_file_checksum[NAME_MAX]; - off_t *offset_list; - int *count_list; - dir_entry_t **entry_list; -}; - - -struct _unify_local_t { - int32_t call_count; - int32_t op_ret; - int32_t op_errno; - mode_t mode; - off_t offset; - dev_t dev; - uid_t uid; - gid_t gid; - int32_t flags; - int32_t entry_count; - int32_t count; // dir_entry_t count; - fd_t *fd; - struct iatt stbuf; - struct iatt stpre; - struct iatt stpost; - struct statvfs statvfs_buf; - struct timespec 
tv[2]; - char *name; - int32_t revalidate; - - ino_t ia_ino; - nlink_t ia_nlink; - - dict_t *dict; - - int16_t *list; - int16_t *new_list; /* Used only in case of rename */ - int16_t index; - - int32_t failed; - int32_t return_eio; /* Used in case of different st-mode - present for a given path */ - - uint64_t inode_generation; /* used to store the per directory - * inode_generation. Got from inode's ctx - * of directory inodes - */ - - struct unify_self_heal_struct *sh_struct; - loc_t loc1, loc2; - - struct iatt poststbuf; - /* When not used for rename, old* - * are used as the attrs for the current - * parent directory. - */ - struct iatt oldpreparent; - struct iatt oldpostparent; - struct iatt newpreparent; - struct iatt newpostparent; - int32_t wbflags; -}; -typedef struct _unify_local_t unify_local_t; - -int32_t zr_unify_self_heal (call_frame_t *frame, - xlator_t *this, - unify_local_t *local); - -#endif /* _UNIFY_H */ |
